python3kgae
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
Lines changed: 36 additions & 17 deletions
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
Lines changed: 18 additions & 18 deletions
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
Lines changed: 5 additions & 5 deletions
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
Lines changed: 10 additions & 10 deletions
@@ -145,7 +145,8 @@ class MVEGatherScatterLowering : public FunctionPass {
   // Optimise the base and offsets of the given address
   bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
   // Try to fold consecutive geps together into one
-  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, unsigned &Scale,
+                 IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -1103,8 +1104,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
   return true;
 }
 
-static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
-                                      IRBuilder<> &Builder) {
+static Value *CheckAndCreateOffsetAdd(Value *X, unsigned ScaleX, Value *Y,
+                                      unsigned ScaleY, IRBuilder<> &Builder) {
   // Splat the non-vector value to a vector of the given type - if the value is
   // a constant (and its value isn't too big), we can even use this opportunity
   // to scale it to the size of the vector elements
@@ -1156,40 +1157,49 @@ static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
       ConstantInt *ConstYEl =
           dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
       if (!ConstXEl || !ConstYEl ||
-          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+          ConstXEl->getZExtValue() * ScaleX +
+                  ConstYEl->getZExtValue() * ScaleY >=
               (unsigned)(1 << (TargetElemSize - 1)))
         return nullptr;
     }
   }
 
-  Value *Add = Builder.CreateAdd(X, Y);
+  Value *XScale = Builder.CreateVectorSplat(
+      XElType->getNumElements(),
+      Builder.getIntN(XElType->getScalarSizeInBits(), ScaleX));
+  Value *YScale = Builder.CreateVectorSplat(
+      YElType->getNumElements(),
+      Builder.getIntN(YElType->getScalarSizeInBits(), ScaleY));
+  Value *Add = Builder.CreateAdd(Builder.CreateMul(X, XScale),
+                                 Builder.CreateMul(Y, YScale));
 
-  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
-  if (checkOffsetSize(Add, GEPType->getNumElements()))
+  if (checkOffsetSize(Add, XElType->getNumElements()))
     return Add;
   else
     return nullptr;
 }
 
 Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
-                                         Value *&Offsets,
+                                         Value *&Offsets, unsigned &Scale,
                                          IRBuilder<> &Builder) {
   Value *GEPPtr = GEP->getPointerOperand();
   Offsets = GEP->getOperand(1);
+  Scale = DL->getTypeAllocSize(GEP->getSourceElementType());
   // We only merge geps with constant offsets, because only for those
   // we can make sure that we do not cause an overflow
-  if (!isa<Constant>(Offsets))
+  if (GEP->getNumIndices() != 1 || !isa<Constant>(Offsets))
     return nullptr;
-  GetElementPtrInst *BaseGEP;
-  if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+  if (GetElementPtrInst *BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr)) {
     // Merge the two geps into one
-    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Scale, Builder);
     if (!BaseBasePtr)
       return nullptr;
-    Offsets =
-        CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+    Offsets = CheckAndCreateOffsetAdd(
+        Offsets, Scale, GEP->getOperand(1),
+        DL->getTypeAllocSize(GEP->getSourceElementType()), Builder);
     if (Offsets == nullptr)
       return nullptr;
+    Scale = 1; // Offsets are now in bytes (i8 units), so the scale is 1.
     return BaseBasePtr;
   }
   return GEPPtr;
@@ -1206,15 +1216,24 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
     Builder.SetInsertPoint(GEP);
     Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
     Value *Offsets;
-    Value *Base = foldGEP(GEP, Offsets, Builder);
+    unsigned Scale;
+    Value *Base = foldGEP(GEP, Offsets, Scale, Builder);
     // We only want to merge the geps if there is a real chance that they can be
     // used by an MVE gather; thus the offset has to have the correct size
     // (always i32 if it is not of vector type) and the base has to be a
     // pointer.
     if (Offsets && Base && Base != GEP) {
+      assert(Scale == 1 && "Expected to fold GEP to a scale of 1");
+      Type *BaseTy = Builder.getInt8PtrTy();
+      if (auto *VecTy = dyn_cast<FixedVectorType>(Base->getType()))
+        BaseTy = FixedVectorType::get(BaseTy, VecTy);
       GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
-          GEP->getSourceElementType(), Base, Offsets, "gep.merged", GEP);
-      GEP->replaceAllUsesWith(NewAddress);
+          Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
+          "gep.merged", GEP);
+      LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
+                        << "\n      new :  " << *NewAddress << "\n");
+      GEP->replaceAllUsesWith(
+          Builder.CreateBitCast(NewAddress, GEP->getType()));
       GEP = NewAddress;
       Changed = true;
     }
 
@@ -294,19 +294,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI14_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI14_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20
@@ -319,19 +319,19 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI15_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI15_0:
-; CHECK-NEXT:    .short 20 @ 0x14
-; CHECK-NEXT:    .short 23 @ 0x17
-; CHECK-NEXT:    .short 26 @ 0x1a
-; CHECK-NEXT:    .short 29 @ 0x1d
-; CHECK-NEXT:    .short 32 @ 0x20
-; CHECK-NEXT:    .short 35 @ 0x23
-; CHECK-NEXT:    .short 38 @ 0x26
-; CHECK-NEXT:    .short 41 @ 0x29
+; CHECK-NEXT:    .short 40 @ 0x28
+; CHECK-NEXT:    .short 46 @ 0x2e
+; CHECK-NEXT:    .short 52 @ 0x34
+; CHECK-NEXT:    .short 58 @ 0x3a
+; CHECK-NEXT:    .short 64 @ 0x40
+; CHECK-NEXT:    .short 70 @ 0x46
+; CHECK-NEXT:    .short 76 @ 0x4c
+; CHECK-NEXT:    .short 82 @ 0x52
 entry:
   %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
   %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20
 
@@ -318,15 +318,15 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    adr r1, .LCPI21_0
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI21_0:
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:    .long 14 @ 0xe
+; CHECK-NEXT:    .long 20 @ 0x14
+; CHECK-NEXT:    .long 32 @ 0x20
+; CHECK-NEXT:    .long 44 @ 0x2c
+; CHECK-NEXT:    .long 56 @ 0x38
 entry:
   %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
 
@@ -856,15 +856,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(i8* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI31_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI31_0:
-; CHECK-OPAQ-NEXT:    .long 4294967295 @ 0xffffffff
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 31 @ 0x1f
-; CHECK-OPAQ-NEXT:    .long 47 @ 0x2f
+; CHECK-OPAQ-NEXT:    .long 4294967292 @ 0xfffffffc
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 28 @ 0x1c
+; CHECK-OPAQ-NEXT:    .long 44 @ 0x2c
   %a = getelementptr i8, i8* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
   %b = bitcast <4 x i8*> %a to <4 x i32*>
   %c = getelementptr inbounds i32, <4 x i32*> %b, i32 -1
@@ -892,15 +892,15 @@ define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(i16* %base) {
 ; CHECK-OPAQ:       @ %bb.0:
 ; CHECK-OPAQ-NEXT:    adr r1, .LCPI32_0
 ; CHECK-OPAQ-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
+; CHECK-OPAQ-NEXT:    vldrw.u32 q0, [r0, q1]
 ; CHECK-OPAQ-NEXT:    bx lr
 ; CHECK-OPAQ-NEXT:    .p2align 4
 ; CHECK-OPAQ-NEXT:  @ %bb.1:
 ; CHECK-OPAQ-NEXT:  .LCPI32_0:
-; CHECK-OPAQ-NEXT:    .long 15 @ 0xf
-; CHECK-OPAQ-NEXT:    .long 5 @ 0x5
-; CHECK-OPAQ-NEXT:    .long 29 @ 0x1d
-; CHECK-OPAQ-NEXT:    .long 235 @ 0xeb
+; CHECK-OPAQ-NEXT:    .long 12 @ 0xc
+; CHECK-OPAQ-NEXT:    .long 18 @ 0x12
+; CHECK-OPAQ-NEXT:    .long 58 @ 0x3a
+; CHECK-OPAQ-NEXT:    .long 280 @ 0x118
   %a = getelementptr i16, i16* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
   %b = bitcast <4 x i16*> %a to <4 x i8*>
   %c = getelementptr i8, <4 x i8*> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>