@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467
467
ElementCount MinProfitableTripCount,
468
468
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469
469
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471
+ VPlan &Plan)
471
472
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472
473
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473
474
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474
- PSI(PSI), RTChecks(RTChecks) {
475
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475
476
// Query this against the original loop and save it here because the profile
476
477
// of the original loop header may change as the transformation happens.
477
478
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522
523
/// and the resume values can come from an additional bypass block, the \p
523
524
/// AdditionalBypass pair provides information about the bypass block and the
524
525
/// end value on the edge from bypass to this loop.
525
- PHINode * createInductionResumeValue(
526
+ void createInductionResumeValue(
526
527
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527
528
ArrayRef<BasicBlock *> BypassBlocks,
528
529
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535
536
/// count of the original loop for both main loop and epilogue vectorization.
536
537
void setTripCount(Value *TC) { TripCount = TC; }
537
538
539
+ /// Returns the {bypass block, bypass value} pair that
+ /// createInductionResumeValue() recorded for \p OrigPhi.
+ /// \p OrigPhi must already have an entry: DenseMap::at() asserts on a missing
+ /// key, whereas find()->second would silently dereference end().
+ std::pair<BasicBlock *, Value *>
540
+ getInductionBypassValue(PHINode *OrigPhi) const {
541
+ return InductionBypassValues.at(OrigPhi);
542
+ }
543
+
538
544
protected:
539
545
friend class LoopVectorizationPlanner;
540
546
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
680
686
/// Structure to hold information about generated runtime checks, responsible
681
687
/// for cleaning the checks, if vectorization turns out unprofitable.
682
688
GeneratedRTChecks &RTChecks;
689
+
690
+ /// Maps each induction phi to its {bypass block, bypass value} pair.
691
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
692
+
693
+ VPlan &Plan;
683
694
};
684
695
685
696
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
721
732
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
722
733
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
723
734
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
724
- GeneratedRTChecks &Checks)
735
+ GeneratedRTChecks &Checks, VPlan &Plan )
725
736
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
726
737
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
727
- CM, BFI, PSI, Checks),
738
+ CM, BFI, PSI, Checks, Plan ),
728
739
EPI(EPI) {}
729
740
730
741
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
761
772
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
762
773
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
763
774
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
764
- GeneratedRTChecks &Check)
775
+ GeneratedRTChecks &Check, VPlan &Plan )
765
776
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
766
- EPI, LVL, CM, BFI, PSI, Check) {}
777
+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
767
778
/// Implements the interface for creating a vectorized skeleton using the
768
779
/// *main loop* strategy (ie the first pass of vplan execution).
769
780
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
790
801
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
791
802
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
792
803
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
793
- GeneratedRTChecks &Checks)
804
+ GeneratedRTChecks &Checks, VPlan &Plan )
794
805
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
795
- EPI, LVL, CM, BFI, PSI, Checks) {
806
+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
796
807
TripCount = EPI.TripCount;
797
808
}
798
809
/// Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2555
2566
nullptr, Twine(Prefix) + "scalar.ph");
2556
2567
}
2557
2568
2558
- PHINode *InnerLoopVectorizer::createInductionResumeValue(
2569
+ static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
2570
+ VPValue *Op) { // Append Op as an operand of the VPIRInstruction in VPBB that wraps the IR phi P.
2571
+ for (VPRecipeBase &R : *VPBB) {
2572
+ auto *IRI = cast<VPIRInstruction>(&R); // cast<> (not dyn_cast<>): every recipe visited is expected to be a VPIRInstruction.
2573
+ if (&IRI->getInstruction() == P) {
2574
+ IRI->addOperand(Op);
2575
+ break; // Only the first recipe wrapping P is updated.
2576
+ }
2577
+ }
2578
+ } // NOTE(review): if no recipe in VPBB wraps P, nothing is changed and no error is reported.
2579
+
2580
+ void InnerLoopVectorizer::createInductionResumeValue(
2559
2581
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2560
2582
ArrayRef<BasicBlock *> BypassBlocks,
2561
2583
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
2590
2612
}
2591
2613
}
2592
2614
2593
- // Create phi nodes to merge from the backedge-taken check block.
2594
- PHINode *BCResumeVal =
2595
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
2596
- LoopScalarPreHeader->getFirstNonPHIIt());
2597
- // Copy original phi DL over to the new one.
2598
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2615
+ VPBasicBlock *MiddleVPBB =
2616
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
2599
2617
2600
- // The new PHI merges the original incoming value, in case of a bypass,
2601
- // or the value at the end of the vectorized loop.
2602
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
2618
+ VPBasicBlock *ScalarPHVPBB = nullptr;
2619
+ if (MiddleVPBB->getNumSuccessors() == 2) {
2620
+ // Order is strict: first is the exit block, second is the scalar preheader.
2621
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
2622
+ } else {
2623
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
2624
+ }
2603
2625
2604
- // Fix the scalar body counter (PHI node).
2605
- // The old induction's phi node in the scalar body needs the truncated
2606
- // value.
2607
- for (BasicBlock *BB : BypassBlocks)
2608
- BCResumeVal->addIncoming(II.getStartValue( ), BB );
2626
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
2627
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
2628
+ VPInstruction::ResumePhi,
2629
+ {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
2630
+ OrigPhi->getDebugLoc( ), "bc.resume.val" );
2609
2631
2610
- if (AdditionalBypass.first)
2611
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
2612
- EndValueFromAdditionalBypass);
2613
- return BCResumeVal;
2632
+ auto *ScalarLoopHeader =
2633
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
2634
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2635
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
2636
+ EndValueFromAdditionalBypass};
2614
2637
}
2615
2638
2616
2639
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
2643
2666
for (const auto &InductionEntry : Legal->getInductionVars()) {
2644
2667
PHINode *OrigPhi = InductionEntry.first;
2645
2668
const InductionDescriptor &II = InductionEntry.second;
2646
- PHINode *BCResumeVal = createInductionResumeValue(
2647
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
2648
- AdditionalBypass);
2649
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
2669
+ createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
2670
+ LoopBypassBlocks, AdditionalBypass);
2650
2671
}
2651
2672
}
2652
2673
@@ -7678,6 +7699,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7678
7699
// the second pass for the scalar loop. The induction resume values for the
7679
7700
// inductions in the epilogue loop are created before executing the plan for
7680
7701
// the epilogue loop.
7702
+ for (VPRecipeBase &R :
7703
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
7704
+ // Create induction resume values for both widened pointer and
7705
+ // integer/fp inductions and update the start value of the induction
7706
+ // recipes to use the resume value.
7707
+ PHINode *IndPhi = nullptr;
7708
+ const InductionDescriptor *ID;
7709
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7710
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
7711
+ ID = &Ind->getInductionDescriptor();
7712
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7713
+ IndPhi = WidenInd->getPHINode();
7714
+ ID = &WidenInd->getInductionDescriptor();
7715
+ } else
7716
+ continue;
7717
+
7718
+ createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
7719
+ LoopBypassBlocks);
7720
+ }
7681
7721
7682
7722
return {LoopVectorPreHeader, nullptr};
7683
7723
}
@@ -8848,14 +8888,9 @@ static void addLiveOutsForFirstOrderRecurrences(
8848
8888
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
8849
8889
"scalar.recur.init");
8850
8890
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
8851
- for (VPRecipeBase &R :
8852
- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor())) {
8853
- auto *IRI = cast<VPIRInstruction>(&R);
8854
- if (&IRI->getInstruction() == FORPhi) {
8855
- IRI->addOperand(ResumePhiRecipe);
8856
- break;
8857
- }
8858
- }
8891
+ addOperandToPhiInVPIRBasicBlock(
8892
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor()), FORPhi,
8893
+ ResumePhiRecipe);
8859
8894
8860
8895
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
8861
8896
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9582,7 +9617,7 @@ static bool processLoopInVPlanNativePath(
9582
9617
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9583
9618
F->getDataLayout(), AddBranchWeights);
9584
9619
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9585
- VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9620
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan );
9586
9621
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9587
9622
<< L->getHeader()->getParent()->getName() << "\"\n");
9588
9623
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10070,11 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10070
10105
assert(IC > 1 && "interleave count should not be 1 or 0");
10071
10106
// If we decided that it is not legal to vectorize the loop, then
10072
10107
// interleave it.
10108
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10073
10109
InnerLoopVectorizer Unroller(
10074
10110
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10075
- ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks);
10111
+ ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
10076
10112
10077
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10078
10113
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10079
10114
10080
10115
ORE->emit([&]() {
@@ -10096,10 +10131,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10096
10131
// to be vectorized by executing the plan (potentially with a different
10097
10132
// factor) again shortly afterwards.
10098
10133
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10134
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10099
10135
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10100
- EPI, &LVL, &CM, BFI, PSI, Checks);
10136
+ EPI, &LVL, &CM, BFI, PSI, Checks,
10137
+ *BestMainPlan);
10101
10138
10102
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10103
10139
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10104
10140
*BestMainPlan, MainILV, DT, true);
10105
10141
++LoopsVectorized;
@@ -10108,11 +10144,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10108
10144
// edges from the first pass.
10109
10145
EPI.MainLoopVF = EPI.EpilogueVF;
10110
10146
EPI.MainLoopUF = EPI.EpilogueUF;
10147
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
10111
10148
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10112
10149
ORE, EPI, &LVL, &CM, BFI, PSI,
10113
- Checks);
10150
+ Checks, BestEpiPlan );
10114
10151
10115
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
10116
10152
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10117
10153
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10118
10154
Header->setName("vec.epilog.vector.body");
@@ -10161,23 +10197,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10161
10197
RdxDesc.getRecurrenceStartValue());
10162
10198
}
10163
10199
} else {
10164
- // Create induction resume values for both widened pointer and
10165
- // integer/fp inductions and update the start value of the induction
10166
- // recipes to use the resume value.
10200
+ // Retrieve the induction resume values for wide inductions from
10201
+ // their original phi nodes in the scalar loop.
10167
10202
PHINode *IndPhi = nullptr;
10168
- const InductionDescriptor *ID;
10169
10203
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10170
10204
IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10171
- ID = &Ind->getInductionDescriptor();
10172
10205
} else {
10173
10206
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10174
10207
IndPhi = WidenInd->getPHINode();
10175
- ID = &WidenInd->getInductionDescriptor();
10176
10208
}
10177
-
10178
- ResumeV = MainILV.createInductionResumeValue(
10179
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10180
- {EPI.MainLoopIterationCountCheck});
10209
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10181
10210
}
10182
10211
assert(ResumeV && "Must have a resume value");
10183
10212
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
@@ -10189,13 +10218,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10189
10218
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10190
10219
DT, true, &ExpandedSCEVs);
10191
10220
++LoopsEpilogueVectorized;
10221
+ BasicBlock *PH = L->getLoopPreheader();
10192
10222
10223
+ for (const auto &[IVPhi, _] : LVL.getInductionVars()) {
10224
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
10225
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi);
10226
+ Inc->setIncomingValueForBlock(BB, V);
10227
+ }
10193
10228
if (!MainILV.areSafetyChecksAdded())
10194
10229
DisableRuntimeUnroll = true;
10195
10230
} else {
10196
10231
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10197
10232
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10198
- PSI, Checks);
10233
+ PSI, Checks, BestPlan );
10199
10234
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10200
10235
++LoopsVectorized;
10201
10236
0 commit comments