llvm · fhahn · Jun 4, 2024 · Oct 9, 2024 · Oct 9, 2024 · Nov 10, 2024
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7647,8 +7647,19 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
        (!VectorizingEpilogue && !ExpandedSCEVs)) &&
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
 
-  // TODO: Move to VPlan transform stage once the transition to the VPlan-based
-  // cost model is complete for better cost estimates.
+  bool IVUpdateMayOverflow =
+      !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF);
+  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+  bool WithoutRuntimeCheck =
+      Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  // Use NUW for the induction increment if we proved that it won't overflow in
+  // the vector loop or when not folding the tail. In the latter case, we know
+  // that the canonical induction increment will not overflow as the vector trip
+  // count is >= increment and a multiple of the increment.
+  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
+  // TODO: Move transforms to VPlan transform stage once the transition to the
+  // VPlan-based cost model is complete for better cost estimates.
+  VPlanTransforms::convertCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck);
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                               OrigLoop->getHeader()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
@@ -8902,29 +8913,26 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   }
 }
 
-// Add the necessary canonical IV and branch recipes required to control the
-// loop.
-static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
-                                  DebugLoc DL) {
+// Add the required canonical IV along with its loop branch, but w/o its
+// increment - which is introduced later.
+static void addCanonicalIV(VPlan &Plan, Type *IdxTy, DebugLoc DL) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
 
   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
+  // TODO: Introduce a separate scalar phi recipe that can be used for codegen,
+  // turning VPCanonicalIVPHIRecipe into an 'abstract' recipe which cannot be
+  // executed directly.
   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
-  VPBuilder Builder(TopRegion->getExitingBasicBlock());
-  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
-  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
-      Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
-      "index.next");
-  CanonicalIVPHI->addOperand(CanonicalIVIncrement);
-
   // Add the BranchOnCount VPInstruction to the latch.
+  VPBuilder Builder(TopRegion->getExitingBasicBlock());
+  // TODO: introduce branch-on-count during VPlan final (pre-codegen) lowering.
   Builder.createNaryOp(VPInstruction::BranchOnCount,
-                       {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+                       {CanonicalIVPHI, &Plan.getVectorTripCount()}, DL);
 }
 
 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
@@ -9230,22 +9238,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
                                             PSE, RequiresScalarEpilogueCheck,
                                             CM.foldTailByMasking(), OrigLoop);
 
-  // Don't use getDecisionAndClampRange here, because we don't know the UF
-  // so this function is better to be conservative, rather than to split
-  // it up into different VPlans.
-  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
-  bool IVUpdateMayOverflow = false;
-  for (ElementCount VF : Range)
-    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
-
   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
-  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
-  // Use NUW for the induction increment if we proved that it won't overflow in
-  // the vector loop or when not folding the tail. In the later case, we know
-  // that the canonical induction increment will not overflow as the vector trip
-  // count is >= increment and a multiple of the increment.
-  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+  addCanonicalIV(*Plan, Legal->getWidestInductionType(), DL);
 
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                 Builder);
@@ -9471,6 +9465,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
     return nullptr;
 
+  // Don't use getDecisionAndClampRange here, because we don't know the UF
+  // so this function is better to be conservative, rather than to split
+  // it up into different VPlans.
+  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
+  bool IVUpdateMayOverflow = false;
+  for (ElementCount VF : Range)
+    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+
+  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
   if (useActiveLaneMask(Style)) {
     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
     // TailFoldingStyle is visible there.
@@ -9516,11 +9519,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
   Term->eraseFromParent();
 
-  // Tail folding is not supported for outer loops, so the induction increment
-  // is guaranteed to not wrap.
-  bool HasNUW = true;
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
-                        DebugLoc());
+  addCanonicalIV(*Plan, Legal->getWidestInductionType(), DebugLoc());
 
   // Collect mapping of IR header phis to header phi recipes, to be used in
   // addScalarResumePhis.
@@ -10272,7 +10271,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
                              isa<VPScalarCastRecipe>(U) ||
                              isa<VPDerivedIVRecipe>(U) ||
                              cast<VPInstruction>(U)->getOpcode() ==
-                                 Instruction::Add;
+                                 VPInstruction::BranchOnCount;
                     }) &&
              "the canonical IV should only be used by its increment or "
-             "the canonical IV should only be used by its increment or "
+             "the canonical IV should only be used by its BranchOnCount or "
-             "the canonical IV should only be used by its increment or "
+             "the canonical IV should only be used by its BranchOnCount or "
              "ScalarIVSteps when resetting the start value");

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -20,6 +20,7 @@
 #include "LoopVectorizationPlanner.h"
 #include "VPlanCFG.h"
 #include "VPlanPatternMatch.h"
+#include "VPlanVerifier.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3239,7 +3239,8 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 
   VPCanonicalIVPHIRecipe *clone() override {
     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
-    R->addOperand(getBackedgeValue());
+    if (getNumOperands() == 2)
+      R->addOperand(getBackedgeValue());
     return R;
   }