Skip to content

Commit 979400b

Browse files
committed
[ARM] Fix MVE gather/scatter merged gep offsets
This fixes the combining of constant vector GEP operands in the optimization of MVE gather/scatter addresses when opaque pointers are enabled. Because opaque pointers reduce the number of bitcasts between GEPs, more GEPs can be folded than before. This can cause problems if the index types now differ between the two GEPs. The fix makes sure each constant is scaled appropriately, which has the effect of transforming the GEPs to have a scale of 1, changing [r0, q0, uxtw #1] gathers to [r0, q0] with a larger q0. This allows a simpler instruction that doesn't need the extra uxtw. Differential Revision: https://reviews.llvm.org/D127733
1 parent f986976 commit 979400b

7 files changed

+169
-150
lines changed

llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,8 @@ class MVEGatherScatterLowering : public FunctionPass {
145145
// Optimise the base and offsets of the given address
146146
bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
147147
// Try to fold consecutive geps together into one
148-
Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
148+
Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale,
149+
IRBuilder<> &Builder);
149150
// Check whether these offsets could be moved out of the loop they're in
150151
bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
151152
// Pushes the given add out of the loop
@@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
11031104
return true;
11041105
}
11051106

1106-
static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
1107-
IRBuilder<> &Builder) {
1107+
static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y,
1108+
unsigned ScaleY, IRBuilder<> &Builder) {
11081109
// Splat the non-vector value to a vector of the given type - if the value is
11091110
// a constant (and its value isn't too big), we can even use this opportunity
11101111
// to scale it to the size of the vector elements
@@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
11561157
ConstantInt *ConstYEl =
11571158
dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
11581159
if (!ConstXEl || !ConstYEl ||
1159-
ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
1160+
ConstXEl->getZExtValue() * ScaleX +
1161+
ConstYEl->getZExtValue() * ScaleY >=
11601162
(unsigned)(1 << (TargetElemSize - 1)))
11611163
return nullptr;
11621164
}
11631165
}
11641166

1165-
Value *Add = Builder.CreateAdd(X, Y);
1167+
Value *XScale = Builder.CreateVectorSplat(
1168+
XElType->getNumElements(),
1169+
Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX));
1170+
Value *YScale = Builder.CreateVectorSplat(
1171+
YElType->getNumElements(),
1172+
Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY));
1173+
Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale),
1174+
Builder.CreateMul(Y, YScale));
11661175

1167-
FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
1168-
if (checkOffsetSize(Add, GEPType->getNumElements()))
1176+
if (checkOffsetSize(Add, XElType->getNumElements()))
11691177
return Add;
11701178
else
11711179
return nullptr;
11721180
}
11731181

11741182
Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
1175-
Value *&Offsets,
1183+
Value *&Offsets, unsigned &Scale,
11761184
IRBuilder<> &Builder) {
11771185
Value *GEPPtr = GEP->getPointerOperand();
11781186
Offsets = GEP->getOperand(1);
1187+
Scale = DL->getTypeAllocSize(GEP->getSourceElementType());
11791188
// We only merge geps with constant offsets, because only for those
11801189
// we can make sure that we do not cause an overflow
1181-
if (!isa<Constant>(Offsets))
1190+
if (GEP->getNumIndices() != 1 || !isa<Constant>(Offsets))
11821191
return nullptr;
1183-
GetElementPtrInst *BaseGEP;
1184-
if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
1192+
if (GetElementPtrInst *BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr)) {
11851193
// Merge the two geps into one
1186-
Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
1194+
Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder);
11871195
if (!BaseBasePtr)
11881196
return nullptr;
1189-
Offsets =
1190-
CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
1197+
Offsets = CheckAndCreateOffsetAdd(
1198+
Offsets, Scale, GEP->getOperand(1),
1199+
DL->getTypeAllocSize(GEP->getSourceElementType()), Builder);
11911200
if (Offsets == nullptr)
11921201
return nullptr;
1202+
Scale = 1; // Scale is always an i8 at this point.
11931203
return BaseBasePtr;
11941204
}
11951205
return GEPPtr;
@@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
12061216
Builder.SetInsertPoint(GEP);
12071217
Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
12081218
Value *Offsets;
1209-
Value *Base = foldGEP(GEP, Offsets, Builder);
1219+
unsigned Scale;
1220+
Value *Base = foldGEP(GEP, Offsets, Scale, Builder);
12101221
// We only want to merge the geps if there is a real chance that they can be
12111222
// used by an MVE gather; thus the offset has to have the correct size
12121223
// (always i32 if it is not of vector type) and the base has to be a
12131224
// pointer.
12141225
if (Offsets && Base && Base != GEP) {
1226+
assert(Scale == 1 && "Expected to fold GEP to a scale of 1");
1227+
Type *BaseTy = Builder.getInt8PtrTy();
1228+
if (auto *VecTy = dyn_cast<FixedVectorType>(Base->getType()))
1229+
BaseTy = FixedVectorType::get(BaseTy, VecTy);
12151230
GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
1216-
GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP);
1217-
GEP->replaceAllUsesWith(NewAddress);
1231+
Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
1232+
"gep.merged", GEP);
1233+
LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
1234+
<< "\n new : " << *NewAddress << "\n");
1235+
GEP->replaceAllUsesWith(
1236+
Builder.CreateBitCast(NewAddress, GEP->getType()));
12181237
GEP = NewAddress;
12191238
Changed = true;
12201239
}

llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -294,19 +294,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>*
294294
; CHECK: @ %bb.0: @ %entry
295295
; CHECK-NEXT: adr r1, .LCPI14_0
296296
; CHECK-NEXT: vldrw.u32 q1, [r1]
297-
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
297+
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
298298
; CHECK-NEXT: bx lr
299299
; CHECK-NEXT: .p2align 4
300300
; CHECK-NEXT: @ %bb.1:
301301
; CHECK-NEXT: .LCPI14_0:
302-
; CHECK-NEXT: .short 20 @ 0x14
303-
; CHECK-NEXT: .short 23 @ 0x17
304-
; CHECK-NEXT: .short 26 @ 0x1a
305-
; CHECK-NEXT: .short 29 @ 0x1d
306-
; CHECK-NEXT: .short 32 @ 0x20
307-
; CHECK-NEXT: .short 35 @ 0x23
308-
; CHECK-NEXT: .short 38 @ 0x26
309-
; CHECK-NEXT: .short 41 @ 0x29
302+
; CHECK-NEXT: .short 40 @ 0x28
303+
; CHECK-NEXT: .short 46 @ 0x2e
304+
; CHECK-NEXT: .short 52 @ 0x34
305+
; CHECK-NEXT: .short 58 @ 0x3a
306+
; CHECK-NEXT: .short 64 @ 0x40
307+
; CHECK-NEXT: .short 70 @ 0x46
308+
; CHECK-NEXT: .short 76 @ 0x4c
309+
; CHECK-NEXT: .short 82 @ 0x52
310310
entry:
311311
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
312312
%ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20
@@ -319,19 +319,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) {
319319
; CHECK: @ %bb.0: @ %entry
320320
; CHECK-NEXT: adr r1, .LCPI15_0
321321
; CHECK-NEXT: vldrw.u32 q1, [r1]
322-
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
322+
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
323323
; CHECK-NEXT: bx lr
324324
; CHECK-NEXT: .p2align 4
325325
; CHECK-NEXT: @ %bb.1:
326326
; CHECK-NEXT: .LCPI15_0:
327-
; CHECK-NEXT: .short 20 @ 0x14
328-
; CHECK-NEXT: .short 23 @ 0x17
329-
; CHECK-NEXT: .short 26 @ 0x1a
330-
; CHECK-NEXT: .short 29 @ 0x1d
331-
; CHECK-NEXT: .short 32 @ 0x20
332-
; CHECK-NEXT: .short 35 @ 0x23
333-
; CHECK-NEXT: .short 38 @ 0x26
334-
; CHECK-NEXT: .short 41 @ 0x29
327+
; CHECK-NEXT: .short 40 @ 0x28
328+
; CHECK-NEXT: .short 46 @ 0x2e
329+
; CHECK-NEXT: .short 52 @ 0x34
330+
; CHECK-NEXT: .short 58 @ 0x3a
331+
; CHECK-NEXT: .short 64 @ 0x40
332+
; CHECK-NEXT: .short 70 @ 0x46
333+
; CHECK-NEXT: .short 76 @ 0x4c
334+
; CHECK-NEXT: .short 82 @ 0x52
335335
entry:
336336
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
337337
%ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20

llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,15 +318,15 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) {
318318
; CHECK: @ %bb.0: @ %entry
319319
; CHECK-NEXT: adr r1, .LCPI21_0
320320
; CHECK-NEXT: vldrw.u32 q1, [r1]
321-
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
321+
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
322322
; CHECK-NEXT: bx lr
323323
; CHECK-NEXT: .p2align 4
324324
; CHECK-NEXT: @ %bb.1:
325325
; CHECK-NEXT: .LCPI21_0:
326-
; CHECK-NEXT: .long 5 @ 0x5
327-
; CHECK-NEXT: .long 8 @ 0x8
328-
; CHECK-NEXT: .long 11 @ 0xb
329-
; CHECK-NEXT: .long 14 @ 0xe
326+
; CHECK-NEXT: .long 20 @ 0x14
327+
; CHECK-NEXT: .long 32 @ 0x20
328+
; CHECK-NEXT: .long 44 @ 0x2c
329+
; CHECK-NEXT: .long 56 @ 0x38
330330
entry:
331331
%ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
332332
%ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5

llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -856,15 +856,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(i8* %base) {
856856
; CHECK-OPAQ: @ %bb.0:
857857
; CHECK-OPAQ-NEXT: adr r1, .LCPI31_0
858858
; CHECK-OPAQ-NEXT: vldrw.u32 q1, [r1]
859-
; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
859+
; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1]
860860
; CHECK-OPAQ-NEXT: bx lr
861861
; CHECK-OPAQ-NEXT: .p2align 4
862862
; CHECK-OPAQ-NEXT: @ %bb.1:
863863
; CHECK-OPAQ-NEXT: .LCPI31_0:
864-
; CHECK-OPAQ-NEXT: .long 4294967295 @ 0xffffffff
865-
; CHECK-OPAQ-NEXT: .long 15 @ 0xf
866-
; CHECK-OPAQ-NEXT: .long 31 @ 0x1f
867-
; CHECK-OPAQ-NEXT: .long 47 @ 0x2f
864+
; CHECK-OPAQ-NEXT: .long 4294967292 @ 0xfffffffc
865+
; CHECK-OPAQ-NEXT: .long 12 @ 0xc
866+
; CHECK-OPAQ-NEXT: .long 28 @ 0x1c
867+
; CHECK-OPAQ-NEXT: .long 44 @ 0x2c
868868
%a = getelementptr i8, i8* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
869869
%b = bitcast <4 x i8*> %a to <4 x i32*>
870870
%c = getelementptr inbounds i32, <4 x i32*> %b, i32 -1
@@ -892,15 +892,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(i16* %base) {
892892
; CHECK-OPAQ: @ %bb.0:
893893
; CHECK-OPAQ-NEXT: adr r1, .LCPI32_0
894894
; CHECK-OPAQ-NEXT: vldrw.u32 q1, [r1]
895-
; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
895+
; CHECK-OPAQ-NEXT: vldrw.u32 q0, [r0, q1]
896896
; CHECK-OPAQ-NEXT: bx lr
897897
; CHECK-OPAQ-NEXT: .p2align 4
898898
; CHECK-OPAQ-NEXT: @ %bb.1:
899899
; CHECK-OPAQ-NEXT: .LCPI32_0:
900-
; CHECK-OPAQ-NEXT: .long 15 @ 0xf
901-
; CHECK-OPAQ-NEXT: .long 5 @ 0x5
902-
; CHECK-OPAQ-NEXT: .long 29 @ 0x1d
903-
; CHECK-OPAQ-NEXT: .long 235 @ 0xeb
900+
; CHECK-OPAQ-NEXT: .long 12 @ 0xc
901+
; CHECK-OPAQ-NEXT: .long 18 @ 0x12
902+
; CHECK-OPAQ-NEXT: .long 58 @ 0x3a
903+
; CHECK-OPAQ-NEXT: .long 280 @ 0x118
904904
%a = getelementptr i16, i16* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
905905
%b = bitcast <4 x i16*> %a to <4 x i8*>
906906
%c = getelementptr i8, <4 x i8*> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>

0 commit comments

Comments
 (0)