Commit 67b84db

Init: New Recipe VPWidenStridedLoadRecipe

Parent: f687ed9
10 files changed: +453 -256 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 105 additions & 7 deletions
@@ -1092,6 +1092,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Strided,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
@@ -1325,6 +1326,20 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
+  /// Returns true if \p I is a memory instruction with strided memory access
+  /// that can be vectorized.
+  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+  /// Get the stride of the strided memory access instruction \p Instr. Return 0
+  /// if the instruction \p Instr is not considered for vectorization as a
+  /// strided memory access.
+  int64_t getStride(Instruction *Instr) const {
+    auto It = StrideInfo.find(Instr);
+    if (It != StrideInfo.end())
+      return It->second;
+    return 0;
+  }
+
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1579,6 +1594,10 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for strided load/store instruction.
+  InstructionCost getStridedLoadStoreCost(Instruction *I,
+                                          ElementCount VF) const;
+
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1718,6 +1737,9 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
+  /// The mapping of memory access instructions to their stride values.
+  DenseMap<Instruction *, int64_t> StrideInfo;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3275,6 +3297,31 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
+bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
+    Instruction *I, ElementCount VF) const {
+  // Get and ensure we have a valid memory instruction.
+  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+
+  // Only support strided access for vector VF.
+  if (!VF.isVector())
+    return false;
+
+  // FIXME: Remove this check for StoreInst after strided store is supported.
+  if (isa<StoreInst>(I))
+    return false;
+
+  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
+  auto *ScalarTy = getLoadStoreType(I);
+  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
+  // to ensure that the accessed addresses are evenly spaced apart by a fixed
+  // stride.
+  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
+         "Only supports strided accesses with a stride of -1");
+
+  const Align Alignment = getLoadStoreAlignment(I);
+  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
+}
+
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
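
For context, the pattern stridedAccessCanBeWidened() targets is an ordinary loop that walks an array backwards with unit step, which Legal->isConsecutivePtr() reports as stride -1. A minimal source-level illustration (not part of this commit):

// Illustration only: a[i] is a consecutive access with stride -1, the case the
// cost model may now widen as a strided load instead of a reversed wide load.
float reverse_sum(const float *a, int n) {
  float sum = 0.0f;
  for (int i = n - 1; i >= 0; --i)
    sum += a[i];
  return sum;
}
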
@@ -3365,9 +3412,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
       return true;
 
-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Strided || WideningDecision == CM_Interleave);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -4205,7 +4252,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
               [](const auto *R) { return Instruction::Select; })
           .Case<VPWidenStoreRecipe>(
               [](const auto *R) { return Instruction::Store; })
-          .Case<VPWidenLoadRecipe>(
+          .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
               [](const auto *R) { return Instruction::Load; })
          .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
              [](const auto *R) { return Instruction::Call; })
@@ -4304,6 +4351,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
      case VPDef::VPWidenPointerInductionSC:
      case VPDef::VPReductionPHISC:
      case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenStridedLoadSC:
      case VPDef::VPWidenLoadEVLSC:
      case VPDef::VPWidenLoadSC:
      case VPDef::VPWidenStoreEVLSC:
@@ -5883,6 +5931,19 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
+                                                    ElementCount VF) const {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
+  const Value *Ptr = getLoadStorePointerOperand(I);
+
+  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
+}
+
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
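
The TTI hooks consulted above (isLegalStridedLoadStore() in stridedAccessCanBeWidened(), getStridedMemoryOpCost() here) can be exercised in isolation. A minimal sketch, not part of this commit; the <vscale x 4 x i32> type, the alignment, and the helper name are assumed purely for illustration:

// Sketch only: mirrors the legality and cost queries the cost model issues to
// TTI for a strided load of a scalable i32 vector.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

static InstructionCost queryStridedLoadCost(const TargetTransformInfo &TTI,
                                            LLVMContext &Ctx) {
  auto *VecTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
  const Align Alignment(4);
  // Bail out with an invalid cost if the target has no strided load/store.
  if (!TTI.isLegalStridedLoadStore(VecTy, Alignment))
    return InstructionCost::getInvalid();
  return TTI.getStridedMemoryOpCost(Instruction::Load, VecTy, /*Ptr=*/nullptr,
                                    /*VariableMask=*/false, Alignment,
                                    TargetTransformInfo::TCK_RecipThroughput);
}
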
@@ -6202,6 +6263,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
                "Expected consecutive stride.");
         InstWidening Decision =
             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+        // Consider using strided load/store for consecutive reverse accesses to
+        // achieve more efficient memory operations.
+        if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
+          const InstructionCost StridedLoadStoreCost =
+              getStridedLoadStoreCost(&I, VF);
+          if (StridedLoadStoreCost < Cost) {
+            Decision = CM_Strided;
+            Cost = StridedLoadStoreCost;
+            StrideInfo[&I] = ConsecutiveStride;
+          }
+        }
         setWideningDecision(&I, VF, Decision, Cost);
         continue;
       }
@@ -6853,6 +6925,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
        return TTI::CastContextHint::Normal;
 
      switch (getWideningDecision(I, VF)) {
+      // TODO: New CastContextHint for strided accesses.
+      case LoopVectorizationCostModel::CM_Strided:
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
@@ -8424,16 +8498,27 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+
+  auto SameWiden = [&](ElementCount VF) -> bool {
+    return Decision == CM.getWideningDecision(I, VF);
+  };
+  bool ContainsWidenVF =
+      LoopVectorizationPlanner::getDecisionAndClampRange(SameWiden, Range);
+  assert(ContainsWidenVF &&
+         "At least widen the memory accesses by the Start VF.");
+
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  bool Strided = Decision == LoopVectorizationCostModel::CM_Strided;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
-  if (Consecutive) {
+  if (Consecutive || Strided) {
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
     if (Reverse) {
+      assert(!Strided && "Reverse and Strided are mutually exclusive.");
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop and it may not be inbounds. Drop Inbounds in that
      // case.
@@ -8444,17 +8529,30 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
       VectorPtr = new VPVectorEndPointerRecipe(
           Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
     } else {
-      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), Strided,
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
     }
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
-  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+  if (LoadInst *Load = dyn_cast<LoadInst>(I)) {
+    if (Strided) {
+      const DataLayout &DL = Load->getDataLayout();
+      auto *StrideTy = DL.getIndexType(Load->getPointerOperand()->getType());
+      int64_t Stride = CM.getStride(Load);
+      assert(Stride == -1 &&
+             "Only stride memory access with a stride of -1 is supported.");
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(Load))));
+      return new VPWidenStridedLoadRecipe(*Load, Ptr, StrideVPV, &Plan.getVF(),
+                                          Mask, VPIRMetadata(*Load, LVer),
+                                          I->getDebugLoc());
+    }
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
+  }
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
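
Note that the stride cached in StrideInfo is measured in elements (-1), while the recipe's stride operand is in bytes, hence the scaling by DL.getTypeAllocSize() above. A hypothetical standalone helper showing the same computation (illustration only, not from this commit):

// Sketch only: the byte stride handed to VPWidenStridedLoadRecipe for a load
// with a known element stride. For a reverse i32 access (element stride -1)
// this yields -4.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

static int64_t byteStrideFor(const llvm::LoadInst &Load, int64_t ElemStride) {
  const llvm::DataLayout &DL = Load.getDataLayout();
  return ElemStride * static_cast<int64_t>(
                          DL.getTypeAllocSize(Load.getType()).getFixedValue());
}
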

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 62 additions & 5 deletions
@@ -542,6 +542,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenStridedLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1714,16 +1715,21 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 };
 
 /// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// Supports both consecutive and reverse consecutive accesses.
+/// TODO: Support non-unit strided accesses.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
                               public VPUnrollPartAccessor<1> {
   Type *IndexedTy;
 
+  /// Indicate whether to compute the pointer for strided memory accesses.
+  bool Strided;
+
 public:
-  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags,
-                        DebugLoc DL)
+  VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool Strided,
+                        GEPNoWrapFlags GEPFlags, DebugLoc DL)
       : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
                             GEPFlags, DL),
-        IndexedTy(IndexedTy) {}
+        IndexedTy(IndexedTy), Strided(Strided) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
 
@@ -1744,7 +1750,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
   }
 
   VPVectorPointerRecipe *clone() override {
-    return new VPVectorPointerRecipe(getOperand(0), IndexedTy,
+    return new VPVectorPointerRecipe(getOperand(0), IndexedTy, Strided,
                                      getGEPNoWrapFlags(), getDebugLoc());
   }
 
@@ -2740,7 +2746,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata {
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
-           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
   }
 
   static inline bool classof(const VPUser *U) {
@@ -2859,6 +2866,56 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   }
 };
 
+/// A recipe for strided load operations, using the base address, stride, and
+/// an optional mask. This recipe will generate a vp.strided.load intrinsic
+/// call to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+                                        public VPValue {
+  VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+                           VPValue *VF, VPValue *Mask,
+                           const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(
+            VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+            /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VPWidenStridedLoadRecipe *clone() override {
+    return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+                                        getStride(), getVF(), getMask(), *this,
+                                        getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+  /// Return the stride operand.
+  VPValue *getStride() const { return getOperand(1); }
+
+  /// Return the VF operand.
+  VPValue *getVF() const { return getOperand(2); }
+
+  /// Generate a strided load.
+  void execute(VPTransformState &State) override;
+
+  /// Return the cost of this VPWidenStridedLoadRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() || Op == getStride() || Op == getVF();
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
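
The recipe above only declares execute(), computeCost(), and print(); their definitions land in VPlanRecipes.cpp, one of the ten changed files not shown in this excerpt. As a rough sketch of the lowering the class comment promises (a vp.strided.load call), execute() could look roughly like the following; the State accessors, the applyMetadata() helper, and the omission of alignment handling are assumptions, not code from the commit:

// Sketch only: plausible lowering of VPWidenStridedLoadRecipe to the
// llvm.experimental.vp.strided.load intrinsic; see VPlanRecipes.cpp in the
// commit for the real implementation.
void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
  Type *ScalarTy = getLoadStoreType(&Ingredient);
  auto *DataTy = VectorType::get(ScalarTy, State.VF);
  IRBuilderBase &Builder = State.Builder;

  Value *Addr = State.get(getAddr(), /*IsScalar=*/true);
  Value *Stride = State.get(getStride(), /*IsScalar=*/true);
  Value *Mask = getMask()
                    ? State.get(getMask())
                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  // The VF operand becomes the explicit vector length (i32) of the VP call.
  Value *EVL = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
                                         Builder.getInt32Ty());

  CallInst *NewLI = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {DataTy, Addr->getType(), Stride->getType()}, {Addr, Stride, Mask, EVL});
  // Alignment attribute and other bookkeeping omitted in this sketch.
  applyMetadata(*NewLI);
  State.set(this, NewLI);
}
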

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 4 additions & 2 deletions
@@ -160,8 +160,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
-         "Store recipes should not define any values");
+  assert(
+      (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          R)) &&
+      "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
 