
Commit a0b61ad

[WIP][VPlan based] Clamp VF range in VPlan transformation
1 parent: d967afe

6 files changed, 146 insertions(+), 180 deletions(-)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 72 deletions
@@ -1329,15 +1329,6 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
-  /// Returns true if \p I is a memory instruction with strided memory access
-  /// that can be vectorized.
-  bool stridedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
-
-  /// Get the stride information of the strided memory accesses.
-  SmallDenseMap<Instruction *, int64_t> getStrideInfo() const {
-    return StrideInfo;
-  }
-
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
@@ -1592,10 +1583,6 @@ class LoopVectorizationCostModel {
   /// element)
   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
 
-  /// The cost computation for strided load/store instruction.
-  InstructionCost getStridedLoadStoreCost(Instruction *I,
-                                          ElementCount VF) const;
-
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
   InstructionCost getScalarizationOverhead(Instruction *I,
@@ -1735,9 +1722,6 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
-  /// The mapping of memory access instructions to their stride values.
-  SmallDenseMap<Instruction *, int64_t> StrideInfo;
-
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3295,31 +3279,6 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
-bool LoopVectorizationCostModel::stridedAccessCanBeWidened(
-    Instruction *I, ElementCount VF) const {
-  // Get and ensure we have a valid memory instruction.
-  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
-
-  // Only support strided access for vector VF.
-  if (!VF.isVector())
-    return false;
-
-  // FIXME: Remove this check for StoreInst after strided store is supported.
-  if (isa<StoreInst>(I))
-    return false;
-
-  [[maybe_unused]] auto *Ptr = getLoadStorePointerOperand(I);
-  auto *ScalarTy = getLoadStoreType(I);
-  // TODO: Support non-unit-reverse strided accesses. Add stride analysis here
-  // to ensure that the accessed addresses are evenly spaced apart by a fixed
-  // stride.
-  assert(Legal->isConsecutivePtr(ScalarTy, Ptr) == -1 &&
-         "Only supports strided accesses with a stride of -1");
-
-  const Align Alignment = getLoadStoreAlignment(I);
-  return TTI.isLegalStridedLoadStore(toVectorTy(ScalarTy, VF), Alignment);
-}
-
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
@@ -5723,19 +5682,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   return Cost;
 }
 
-InstructionCost
-LoopVectorizationCostModel::getStridedLoadStoreCost(Instruction *I,
-                                                    ElementCount VF) const {
-  Type *ValTy = getLoadStoreType(I);
-  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
-
-  return TTI.getStridedMemoryOpCost(I->getOpcode(), VectorTy, Ptr,
-                                    Legal->isMaskRequired(I), Alignment,
-                                    CostKind, I);
-}
-
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                     ElementCount VF,
@@ -6055,17 +6001,6 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
              "Expected consecutive stride.");
       InstWidening Decision =
           ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
-      // Consider using strided load/store for consecutive reverse accesses to
-      // achieve more efficient memory operations.
-      if (ConsecutiveStride == -1 && stridedAccessCanBeWidened(&I, VF)) {
-        const InstructionCost StridedLoadStoreCost =
-            getStridedLoadStoreCost(&I, VF);
-        if (StridedLoadStoreCost < Cost) {
-          Decision = CM_Strided;
-          Cost = StridedLoadStoreCost;
-          StrideInfo[&I] = ConsecutiveStride;
-        }
-      }
       setWideningDecision(&I, VF, Decision, Cost);
       continue;
     }
@@ -9478,12 +9413,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
-                          CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
+  if (!CM.foldTailWithEVL())
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
-  }
+
+  // !!! NEED COMMENT
+  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+                           CostCtx, Range);
 
   for (ElementCount VF : Range)
     Plan->addVF(VF);
@@ -9495,9 +9433,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
                            InterleaveGroups, RecipeBuilder,
                            CM.isScalarEpilogueAllowed());
-  // !!! NEED COMMENT
-  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
-                           CM.getStrideInfo());
 
   // Replace VPValues for known constant strides guaranteed by predicate scalar
   // evolution.
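
Note: the range clamping relied on above comes from LoopVectorizationPlanner::getDecisionAndClampRange, which evaluates a per-VF predicate at Range.Start and shrinks Range.End so that every VF remaining in the range yields the same decision. A minimal sketch of that contract, paraphrased from the upstream helper (exact signature and iteration details may differ):

    // Sketch: test Predicate at the start of the range, then clamp Range.End
    // at the first VF whose decision differs, so all VFs left in Range agree
    // with the returned decision and one VPlan can serve the whole range.
    bool getDecisionAndClampRange(
        const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
      assert(!Range.isEmpty() && "Trying to test an empty VF range.");
      bool PredicateAtRangeStart = Predicate(Range.Start);
      // VFRange iterates over the candidate VFs (powers of two) in the range.
      for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
        if (Predicate(TmpVF) != PredicateAtRangeStart) {
          Range.End = TmpVF;
          break;
        }
      return PredicateAtRangeStart;
    }

This is what lets convertToStridedAccesses (later in this diff) fold legality and cost checks into a single predicate, instead of threading a precomputed StrideInfo map from the legacy cost model into the planner.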

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 4 deletions
@@ -1718,6 +1718,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 
   VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
 
+  VPValue *getPtr() const { return getOperand(0); }
+
   VPValue *getVFValue() { return getOperand(1); }
   const VPValue *getVFValue() const { return getOperand(1); }
 
@@ -3161,10 +3163,6 @@ struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
   /// Generate a strided load.
   void execute(VPTransformState &State) override;
 
-  /// Return the cost of this VPWidenStridedLoadRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 16 deletions
@@ -2316,7 +2316,7 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
       ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
   // LastLane = 1 - RunTimeVF
   Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-  Value *Ptr = State.get(getOperand(0), VPLane(0));
+  Value *Ptr = State.get(getPtr(), VPLane(0));
   Value *ResultPtr =
       Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
   ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
@@ -2923,9 +2923,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
+  unsigned Opcode =
+      isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          this)
+          ? Instruction::Load
+          : Instruction::Store;
 
   if (!Consecutive) {
     // TODO: Using the original IR may not be accurate.
@@ -2934,6 +2936,11 @@
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
+
+    if (isa<VPWidenStridedLoadRecipe>(this))
+      return Ctx.TTI.getStridedMemoryOpCost(
+          Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+
     return Ctx.TTI.getAddressComputationCost(Ty) +
            Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
                                           Ctx.CostKind, &Ingredient);
@@ -3128,18 +3135,6 @@ void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-InstructionCost
-VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
-                                      VPCostContext &Ctx) const {
-  Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
-  const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-
-  return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                        IsMasked, Alignment, Ctx.CostKind,
-                                        &Ingredient);
-}
-
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
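
Design note: with the computeCost override removed from VPWidenStridedLoadRecipe (above), all widened memory recipes share VPWidenMemoryRecipe::computeCost, which dispatches on the recipe kind: strided loads are costed via TTI::getStridedMemoryOpCost, while the remaining non-consecutive accesses keep the gather/scatter cost path. The strided cost is therefore computed in exactly one place.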

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 56 additions & 36 deletions
@@ -2517,48 +2517,68 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
-void VPlanTransforms::convertToStridedAccesses(
-    VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo) {
-  // !!! FIXME: Should remove StrideInfo for next step.
-  if (Plan.hasScalarVFOnly() || StrideInfo.empty())
+void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                               VFRange &Range) {
+  if (Plan.hasScalarVFOnly())
     return;
 
-  // !!! FIXME: Should clamp VF for legal and cost in next step
   SmallVector<VPRecipeBase *> ToErase;
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      // !!! FIXME: Should use LoadR->isReverse() for next step
-      if (auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
-          LoadR && !LoadR->isConsecutive()) {
-        auto *LI = cast<LoadInst>(&LoadR->getIngredient());
-        auto It = StrideInfo.find(LI);
-        if (It == StrideInfo.end())
-          continue;
-        int64_t Stride = It->second;
-        assert(Stride == -1 &&
-               "Only stride memory access with a stride of -1 is supported.");
-        // !!! FIXME: Should get VPVectorEndPointerRecipe for reverse
-        VPValue *Ptr = LoadR->getAddr();
-        auto *GEP = dyn_cast<GetElementPtrInst>(
-            Ptr->getUnderlyingValue()->stripPointerCasts());
-        auto *NewPtr = new VPVectorPointerRecipe(
-            Ptr, getLoadStoreType(LI), /*Stride*/ true,
-            GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
-            LoadR->getDebugLoc());
-        NewPtr->insertBefore(LoadR);
-
-        const DataLayout &DL = LI->getDataLayout();
-        auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
-        VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
-            StrideTy, Stride * DL.getTypeAllocSize(getLoadStoreType(LI))));
-        auto *StridedLoad = new VPWidenStridedLoadRecipe(
-            *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
-            LoadR->getDebugLoc());
-        StridedLoad->insertBefore(LoadR);
-        LoadR->replaceAllUsesWith(StridedLoad);
-        ToErase.push_back(LoadR);
-      }
+      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+      // TODO: support strided store
+      // TODO: support strided accesses with stride not equal to -1
+      if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || !MemR->isReverse())
+        continue;
+
+      Instruction &Ingredient = MemR->getIngredient();
+      Type *ElementTy = getLoadStoreType(&Ingredient);
+
+      auto IsProfitable = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(ElementTy, VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return false;
+        const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+        const InstructionCost StridedLoadStoreCost =
+            Ctx.TTI.getStridedMemoryOpCost(
+                Ingredient.getOpcode(), DataTy,
+                getLoadStorePointerOperand(&Ingredient), MemR->isMasked(),
+                Alignment, Ctx.CostKind, &Ingredient);
+        return StridedLoadStoreCost < CurrentCost;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
+                                                              Range))
+        continue;
+
+      // The stride of consecutive reverse access must be -1.
+      int64_t Stride = -1;
+      auto *VecEndPtr = cast<VPVectorEndPointerRecipe>(MemR->getAddr());
+      VPValue *Ptr = VecEndPtr->getPtr();
+      auto *GEP = dyn_cast<GetElementPtrInst>(
+          Ptr->getUnderlyingValue()->stripPointerCasts());
+      // Create a new vector pointer for strided access.
+      auto *NewPtr = new VPVectorPointerRecipe(
+          Ptr, ElementTy, /*Stride=*/true,
+          GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(),
+          VecEndPtr->getDebugLoc());
+      NewPtr->insertBefore(MemR);
+
+      auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
+      auto *LI = cast<LoadInst>(&Ingredient);
+      const DataLayout &DL = LI->getDataLayout();
+      auto *StrideTy = DL.getIndexType(LI->getPointerOperand()->getType());
+      VPValue *StrideVPV = Plan.getOrAddLiveIn(ConstantInt::get(
+          StrideTy, Stride * DL.getTypeAllocSize(ElementTy)));
+      auto *StridedLoad = new VPWidenStridedLoadRecipe(
+          *LI, NewPtr, StrideVPV, &Plan.getVF(), LoadR->getMask(), *LoadR,
+          LoadR->getDebugLoc());
+      StridedLoad->insertBefore(LoadR);
+      LoadR->replaceAllUsesWith(StridedLoad);
+
+      ToErase.append({LoadR, VecEndPtr});
     }
   }
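
To see the clamping in action, consider a hypothetical example (VF values invented for illustration): if the plan's range is [2, 16) and IsProfitable holds at VF=2 and VF=4 but fails at VF=8, getDecisionAndClampRange returns true and clamps Range.End to 8. This plan then converts the reverse load into a strided load for VF in {2, 4}, while VF >= 8 lands in a later plan that keeps the widened reverse load.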

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 2 additions & 2 deletions
@@ -172,8 +172,8 @@ struct VPlanTransforms {
       VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
 
   // !!! NEED COMMENT
-  static void convertToStridedAccesses(
-      VPlan &Plan, const SmallDenseMap<Instruction *, int64_t> &StrideInfo);
+  static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                       VFRange &Range);
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
