@@ -8527,9 +8527,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8527
8527
{CanonicalIVIncrement, &Plan.getVectorTripCount ()}, DL);
8528
8528
}
8529
8529
8530
- // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8531
- // original exit block.
8532
- static void addUsersInExitBlock (
8530
+ // Collect (ExitPhi, ExitingValue) pairs for phis in the original exit block
8531
+ // that are modeled in VPlan. Some exiting values are not modeled explicitly yet
8532
+ // and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe,
8533
+ // VPWidenPointerInductionRecipe and induction increments.
8534
+ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
8533
8535
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8534
8536
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8535
8537
auto MiddleVPBB =
@@ -8538,9 +8540,8 @@ static void addUsersInExitBlock(
8538
8540
// and there is nothing to fix from vector loop; phis should have incoming
8539
8541
// from scalar loop only.
8540
8542
if (MiddleVPBB->getNumSuccessors () != 2 )
8541
- return ;
8542
-
8543
- // Introduce VPUsers modeling the exit values.
8543
+ return {};
8544
+ MapVector<PHINode *, VPValue *> ExitingValuesToFix;
8544
8545
BasicBlock *ExitBB =
8545
8546
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
8546
8547
BasicBlock *ExitingBB = OrigLoop->getExitingBlock ();
@@ -8561,15 +8562,52 @@ static void addUsersInExitBlock(
8561
8562
return P && Inductions.contains (P);
8562
8563
})))
8563
8564
continue ;
8564
- Plan. addLiveOut ( &ExitPhi, V);
8565
+ ExitingValuesToFix. insert ({ &ExitPhi, V} );
8565
8566
}
8567
+ return ExitingValuesToFix;
8566
8568
}
8567
8569
8568
- // / Feed a resume value for every FOR from the vector loop to the scalar loop,
8569
- // / if middle block branches to scalar preheader, by introducing ExtractFromEnd
8570
- // / and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8571
- // / latter and corresponds to the scalar header.
8572
- static void addLiveOutsForFirstOrderRecurrences (VPlan &Plan) {
8570
+ // Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry
8571
+ // in \p ExitingValuesToFix.
8572
+ static void
8573
+ addUsersInExitBlock (VPlan &Plan,
8574
+ MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
8575
+ if (ExitingValuesToFix.empty ())
8576
+ return ;
8577
+
8578
+ auto MiddleVPBB =
8579
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
8580
+ BasicBlock *ExitBB =
8581
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
8582
+ // TODO: set B to MiddleVPBB->getFirstNonPhi(), taking care of affected tests.
8583
+ VPBuilder B (MiddleVPBB);
8584
+ if (auto *Terminator = MiddleVPBB->getTerminator ()) {
8585
+ auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand (0 ));
8586
+ assert ((!Condition || Condition->getParent () == MiddleVPBB) &&
8587
+ " Condition expected in MiddleVPBB" );
8588
+ B.setInsertPoint (Condition ? Condition : Terminator);
8589
+ }
8590
+
8591
+ // Introduce VPUsers modeling the exit values.
8592
+ for (const auto &[ExitPhi, V] : ExitingValuesToFix) {
8593
+ VPValue *Ext = B.createNaryOp (
8594
+ VPInstruction::ExtractFromEnd,
8595
+ {V, Plan.getOrAddLiveIn (ConstantInt::get (
8596
+ IntegerType::get (ExitBB->getContext (), 32 ), 1 ))});
8597
+ Plan.addLiveOut (ExitPhi, Ext);
8598
+ }
8599
+ }
8600
+
8601
+ // / Handle live-outs for first-order recurrences, both in the scalar preheader
8602
+ // / and the original exit block:
8603
+ // / 1. Feed a resume value for every FOR from the vector loop to the scalar
8604
+ // / loop, if middle block branches to scalar preheader, by introducing
8605
+ // / ExtractFromEnd and ResumePhi recipes in each, respectively, and a
8606
+ // / VPLiveOut which uses the latter and corresponds to the scalar header.
8607
+ // / 2. Feed the penultimate value of recurrences to their LCSSA phi users in
8608
+ // / the original exit block using a VPLiveOut.
8609
+ static void addLiveOutsForFirstOrderRecurrences (
8610
+ VPlan &Plan, MapVector<PHINode *, VPValue *> &ExitingValuesToFix) {
8573
8611
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion ();
8574
8612
8575
8613
// Start by finding out if middle block branches to scalar preheader, which is
@@ -8578,21 +8616,31 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8578
8616
// TODO: Should be replaced by
8579
8617
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8580
8618
// scalar region is modeled as well.
8581
- VPBasicBlock *ScalarPHVPBB = nullptr ;
8582
8619
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor ());
8583
- for (VPBlockBase *Succ : MiddleVPBB->getSuccessors ()) {
8584
- if (isa<VPIRBasicBlock>(Succ))
8585
- continue ;
8586
- assert (!ScalarPHVPBB && " Two candidates for ScalarPHVPBB?" );
8587
- ScalarPHVPBB = cast<VPBasicBlock>(Succ);
8620
+ BasicBlock *ExitBB = nullptr ;
8621
+ VPBasicBlock *ScalarPHVPBB = nullptr ;
8622
+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
8623
+ // Order is strict: first is the exit block, second is the scalar preheader.
8624
+ ExitBB =
8625
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])->getIRBasicBlock ();
8626
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
8627
+ } else if (ExitingValuesToFix.empty ()) {
8628
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
8629
+ } else {
8630
+ ExitBB = cast<VPIRBasicBlock>(MiddleVPBB->getSingleSuccessor ())
8631
+ ->getIRBasicBlock ();
8588
8632
}
8589
- if (!ScalarPHVPBB)
8633
+ if (!ScalarPHVPBB) {
8634
+ assert (ExitingValuesToFix.empty () &&
8635
+ " missed inserting extracts for exiting values" );
8590
8636
return ;
8637
+ }
8591
8638
8592
8639
VPBuilder ScalarPHBuilder (ScalarPHVPBB);
8593
8640
VPBuilder MiddleBuilder (MiddleVPBB);
8594
8641
// Reset insert point so new recipes are inserted before terminator and
8595
8642
// condition, if there is either the former or both.
8643
+ // TODO: set MiddleBuilder to MiddleVPBB->getFirstNonPhi().
8596
8644
if (auto *Terminator = MiddleVPBB->getTerminator ()) {
8597
8645
auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand (0 ));
8598
8646
assert ((!Condition || Condition->getParent () == MiddleVPBB) &&
@@ -8601,20 +8649,110 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8601
8649
}
8602
8650
VPValue *OneVPV = Plan.getOrAddLiveIn (
8603
8651
ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
8652
+ VPValue *TwoVPV = Plan.getOrAddLiveIn (
8653
+ ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 2 ));
8604
8654
8605
8655
for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock ()->phis ()) {
8606
8656
auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
8607
8657
if (!FOR)
8608
8658
continue ;
8609
8659
8660
+ // This is the second phase of vectorizing first-order recurrences, creating
8661
+ // extracts for users outside the loop. An overview of the transformation is
8662
+ // described below. Suppose we have the following loop with some use after
8663
+ // the loop of the last a[i-1],
8664
+ //
8665
+ // for (int i = 0; i < n; ++i) {
8666
+ // t = a[i - 1];
8667
+ // b[i] = a[i] - t;
8668
+ // }
8669
+ // use t;
8670
+ //
8671
+ // There is a first-order recurrence on "a". For this loop, the shorthand
8672
+ // scalar IR looks like:
8673
+ //
8674
+ // scalar.ph:
8675
+ // s.init = a[-1]
8676
+ // br scalar.body
8677
+ //
8678
+ // scalar.body:
8679
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
8680
+ // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
8681
+ // s2 = a[i]
8682
+ // b[i] = s2 - s1
8683
+ // br cond, scalar.body, exit.block
8684
+ //
8685
+ // exit.block:
8686
+ // use = lcssa.phi [s1, scalar.body]
8687
+ //
8688
+ // In this example, s1 is a recurrence because its value depends on the
8689
+ // previous iteration. In the first phase of vectorization, we created a
8690
+ // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
8691
+ // for users in the scalar preheader and exit block.
8692
+ //
8693
+ // vector.ph:
8694
+ // v_init = vector(..., ..., ..., a[-1])
8695
+ // br vector.body
8696
+ //
8697
+ // vector.body:
8698
+ // i = phi [0, vector.ph], [i+4, vector.body]
8699
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
8700
+ // v2 = a[i, i+1, i+2, i+3]
8701
+ // b[i] = v2 - v1
8702
+ // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
8703
+ // b[i, i+1, i+2, i+3] = v2 - v1
8704
+ // br cond, vector.body, middle.block
8705
+ //
8706
+ // middle.block:
8707
+ // vector.recur.extract.for.phi = v2(2)
8708
+ // vector.recur.extract = v2(3)
8709
+ // br cond, scalar.ph, exit.block
8710
+ //
8711
+ // scalar.ph:
8712
+ // scalar.recur.init = phi [vector.recur.extract, middle.block],
8713
+ // [s.init, otherwise]
8714
+ // br scalar.body
8715
+ //
8716
+ // scalar.body:
8717
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
8718
+ // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
8719
+ // s2 = a[i]
8720
+ // b[i] = s2 - s1
8721
+ // br cond, scalar.body, exit.block
8722
+ //
8723
+ // exit.block:
8724
+ // lo = lcssa.phi [s1, scalar.body],
8725
+ // [vector.recur.extract.for.phi, middle.block]
8726
+ //
8610
8727
// Extract the resume value and create a new VPLiveOut for it.
8611
8728
auto *Resume = MiddleBuilder.createNaryOp (VPInstruction::ExtractFromEnd,
8612
8729
{FOR->getBackedgeValue (), OneVPV},
8613
8730
{}, " vector.recur.extract" );
8614
8731
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp (
8615
8732
VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
8616
8733
" scalar.recur.init" );
8617
- Plan.addLiveOut (cast<PHINode>(FOR->getUnderlyingInstr ()), ResumePhiRecipe);
8734
+ auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8735
+ Plan.addLiveOut (FORPhi, ResumePhiRecipe);
8736
+
8737
+ // Now create VPLiveOuts for users in the exit block.
8738
+ // Extract the penultimate value of the recurrence and add VPLiveOut
8739
+ // users of the recurrence splice.
8740
+
8741
+ // No edge from the middle block to the unique exit block has been inserted
8742
+ // and there is nothing to fix from vector loop; phis should have incoming
8743
+ // from scalar loop only.
8744
+ if (ExitingValuesToFix.empty ())
8745
+ continue ;
8746
+ for (User *U : FORPhi->users ()) {
8747
+ auto *UI = cast<Instruction>(U);
8748
+ if (UI->getParent () != ExitBB)
8749
+ continue ;
8750
+ VPValue *Ext = MiddleBuilder.createNaryOp (
8751
+ VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue (), TwoVPV}, {},
8752
+ " vector.recur.extract.for.phi" );
8753
+ Plan.addLiveOut (cast<PHINode>(UI), Ext);
8754
+ ExitingValuesToFix.erase (cast<PHINode>(UI));
8755
+ }
8618
8756
}
8619
8757
}
8620
8758
@@ -8769,16 +8907,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8769
8907
// After here, VPBB should not be used.
8770
8908
VPBB = nullptr ;
8771
8909
8772
- addUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
8773
- Legal->getInductionVars ());
8774
-
8775
8910
assert (isa<VPRegionBlock>(Plan->getVectorLoopRegion ()) &&
8776
8911
!Plan->getVectorLoopRegion ()->getEntryBasicBlock ()->empty () &&
8777
8912
" entry block must be set to a VPRegionBlock having a non-empty entry "
8778
8913
" VPBasicBlock" );
8779
8914
RecipeBuilder.fixHeaderPhis ();
8780
8915
8781
- addLiveOutsForFirstOrderRecurrences (*Plan);
8916
+ MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
8917
+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
8918
+
8919
+ addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
8920
+ addUsersInExitBlock (*Plan, ExitingValuesToFix);
8782
8921
8783
8922
// ---------------------------------------------------------------------------
8784
8923
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -8931,6 +9070,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8931
9070
// iteration. The final value is selected by the final ComputeReductionResult.
8932
9071
void LoopVectorizationPlanner::adjustRecipesForReductions (
8933
9072
VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9073
+ using namespace VPlanPatternMatch ;
8934
9074
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion ();
8935
9075
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock ();
8936
9076
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
@@ -8988,10 +9128,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
8988
9128
for (unsigned I = 0 ; I != Worklist.size (); ++I) {
8989
9129
VPSingleDefRecipe *Cur = Worklist[I];
8990
9130
for (VPUser *U : Cur->users ()) {
8991
- auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
8992
- if (!UserRecipe) {
8993
- assert (isa<VPLiveOut>(U) &&
8994
- " U must either be a VPSingleDef or VPLiveOut" );
9131
+ auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9132
+ if (!UserRecipe->getParent ()->getEnclosingLoopRegion ()) {
9133
+ assert (match (U, m_Binary<VPInstruction::ExtractFromEnd>(
9134
+ m_VPValue (), m_VPValue ())) &&
9135
+ " U must be an ExtractFromEnd VPInstruction" );
8995
9136
continue ;
8996
9137
}
8997
9138
Worklist.insert (UserRecipe);
@@ -9208,9 +9349,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9208
9349
auto *FinalReductionResult = new VPInstruction (
9209
9350
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9210
9351
FinalReductionResult->insertBefore (*MiddleVPBB, IP);
9211
- OrigExitingVPV->replaceUsesWithIf (
9212
- FinalReductionResult,
9213
- [](VPUser &User, unsigned ) { return isa<VPLiveOut>(&User); });
9352
+ OrigExitingVPV->replaceUsesWithIf (FinalReductionResult, [](VPUser &User,
9353
+ unsigned ) {
9354
+ return match (&User, m_Binary<VPInstruction::ExtractFromEnd>(m_VPValue (),
9355
+ m_VPValue ()));
9356
+ });
9214
9357
}
9215
9358
9216
9359
VPlanTransforms::clearReductionWrapFlags (*Plan);
0 commit comments