@@ -543,11 +543,6 @@ class InnerLoopVectorizer {
543
543
protected:
544
544
friend class LoopVectorizationPlanner;
545
545
546
- /// Set up the values of the IVs correctly when exiting the vector loop.
547
- virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548
- Value *VectorTripCount, BasicBlock *MiddleBlock,
549
- VPTransformState &State);
550
-
551
546
/// Iteratively sink the scalarized operands of a predicated instruction into
552
547
/// the block that was created for it.
553
548
void sinkScalarOperands(Instruction *PredInst);
@@ -785,10 +780,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
785
780
BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786
781
void printDebugTracesAtStart() override;
787
782
void printDebugTracesAtEnd() override;
788
-
789
- void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
790
- Value *VectorTripCount, BasicBlock *MiddleBlock,
791
- VPTransformState &State) override {};
792
783
};
793
784
794
785
// A specialized derived class of inner loop vectorizer that performs
@@ -2782,97 +2773,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2782
2773
return LoopVectorPreHeader;
2783
2774
}
2784
2775
2785
- // Fix up external users of the induction variable. At this point, we are
2786
- // in LCSSA form, with all external PHIs that use the IV having one input value,
2787
- // coming from the remainder loop. We need those PHIs to also have a correct
2788
- // value for the IV when arriving directly from the middle block.
2789
- void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2790
- const InductionDescriptor &II,
2791
- Value *VectorTripCount,
2792
- BasicBlock *MiddleBlock,
2793
- VPTransformState &State) {
2794
- // There are two kinds of external IV usages - those that use the value
2795
- // computed in the last iteration (the PHI) and those that use the penultimate
2796
- // value (the value that feeds into the phi from the loop latch).
2797
- // We allow both, but they, obviously, have different values.
2798
-
2799
- DenseMap<Value *, Value *> MissingVals;
2800
-
2801
- Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2802
- OrigLoop->getLoopPreheader()))
2803
- ->getIncomingValueForBlock(MiddleBlock);
2804
-
2805
- // An external user of the last iteration's value should see the value that
2806
- // the remainder loop uses to initialize its own IV.
2807
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2808
- for (User *U : PostInc->users()) {
2809
- Instruction *UI = cast<Instruction>(U);
2810
- if (!OrigLoop->contains(UI)) {
2811
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
2812
- MissingVals[UI] = EndValue;
2813
- }
2814
- }
2815
-
2816
- // An external user of the penultimate value need to see EndValue - Step.
2817
- // The simplest way to get this is to recompute it from the constituent SCEVs,
2818
- // that is Start + (Step * (CRD - 1)).
2819
- for (User *U : OrigPhi->users()) {
2820
- auto *UI = cast<Instruction>(U);
2821
- if (!OrigLoop->contains(UI)) {
2822
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
2823
- IRBuilder<> B(MiddleBlock->getTerminator());
2824
-
2825
- // Fast-math-flags propagate from the original induction instruction.
2826
- if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2827
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2828
-
2829
- VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2830
- assert(StepVPV && "step must have been expanded during VPlan execution");
2831
- Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2832
- : State.get(StepVPV, VPLane(0));
2833
- Value *Escape = nullptr;
2834
- if (EndValue->getType()->isIntegerTy())
2835
- Escape = B.CreateSub(EndValue, Step);
2836
- else if (EndValue->getType()->isPointerTy())
2837
- Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2838
- else {
2839
- assert(EndValue->getType()->isFloatingPointTy() &&
2840
- "Unexpected induction type");
2841
- Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2842
- Instruction::FAdd
2843
- ? Instruction::FSub
2844
- : Instruction::FAdd,
2845
- EndValue, Step);
2846
- }
2847
- Escape->setName("ind.escape");
2848
- MissingVals[UI] = Escape;
2849
- }
2850
- }
2851
-
2852
- assert((MissingVals.empty() ||
2853
- all_of(MissingVals,
2854
- [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2855
- return all_of(
2856
- predecessors(cast<Instruction>(P.first)->getParent()),
2857
- [MiddleBlock, this](BasicBlock *Pred) {
2858
- return Pred == MiddleBlock ||
2859
- Pred == OrigLoop->getLoopLatch();
2860
- });
2861
- })) &&
2862
- "Expected escaping values from latch/middle.block only");
2863
-
2864
- for (auto &I : MissingVals) {
2865
- PHINode *PHI = cast<PHINode>(I.first);
2866
- // One corner case we have to handle is two IVs "chasing" each-other,
2867
- // that is %IV2 = phi [...], [ %IV1, %latch ]
2868
- // In this case, if IV1 has an external use, we need to avoid adding both
2869
- // "last value of IV1" and "penultimate value of IV2". So, verify that we
2870
- // don't already have an incoming value for the middle block.
2871
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2872
- PHI->addIncoming(I.second, MiddleBlock);
2873
- }
2874
- }
2875
-
2876
2776
namespace {
2877
2777
2878
2778
struct CSEDenseMapInfo {
@@ -2999,24 +2899,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2999
2899
for (PHINode &PN : Exit->phis())
3000
2900
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3001
2901
3002
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
3003
- // No edge from the middle block to the unique exit block has been inserted
3004
- // and there is nothing to fix from vector loop; phis should have incoming
3005
- // from scalar loop only.
3006
- } else {
3007
- // TODO: Check in VPlan to see if IV users need fixing instead of checking
3008
- // the cost model.
3009
-
3010
- // If we inserted an edge from the middle block to the unique exit block,
3011
- // update uses outside the loop (phis) to account for the newly inserted
3012
- // edge.
3013
-
3014
- // Fix-up external users of the induction variables.
3015
- for (const auto &Entry : Legal->getInductionVars())
3016
- fixupIVUsers(Entry.first, Entry.second,
3017
- getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
3018
- }
3019
-
3020
2902
// Don't apply optimizations below when no vector region remains, as they all
3021
2903
// require a vector loop at the moment.
3022
2904
if (!State.Plan->getVectorLoopRegion())
@@ -9049,11 +8931,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9049
8931
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9050
8932
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9051
8933
/// the end value of the induction.
9052
- static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9053
- VPBuilder &VectorPHBuilder,
9054
- VPBuilder &ScalarPHBuilder,
9055
- VPTypeAnalysis &TypeInfo,
9056
- VPValue *VectorTC) {
8934
+ static VPInstruction *addResumePhiRecipeForInduction(
8935
+ VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8936
+ VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
9057
8937
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9058
8938
// Truncated wide inductions resume from the last lane of their vector value
9059
8939
// in the last vector iteration which is handled elsewhere.
@@ -9087,8 +8967,10 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
9087
8967
9088
8968
/// Create resume phis in the scalar preheader for first-order recurrences,
9089
8969
/// reductions and inductions, and update the VPIRInstructions wrapping the
9090
- /// original phis in the scalar header.
9091
- static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8970
+ /// original phis in the scalar header. End values for inductions are added to
8971
+ /// \p IVEndValues.
8972
+ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8973
+ DenseMap<VPValue *, VPValue *> &IVEndValues) {
9092
8974
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9093
8975
auto *ScalarPH = Plan.getScalarPreheader();
9094
8976
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
@@ -9105,11 +8987,16 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9105
8987
if (!ScalarPhiI)
9106
8988
break;
9107
8989
8990
+ // TODO: Extract the final value from the induction recipe initially, then
8991
+ // optimize it to the pre-computed end value in optimizeInductionExitUsers.
9108
8992
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9109
8993
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9110
- if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
8994
+ if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
9111
8995
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9112
8996
&Plan.getVectorTripCount())) {
8997
+ assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
8998
+ "Expected a ResumePhi");
8999
+ IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9113
9000
ScalarPhiIRI->addOperand(ResumePhi);
9114
9001
continue;
9115
9002
}
@@ -9140,65 +9027,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9140
9027
}
9141
9028
}
9142
9029
9143
- /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9144
- /// either an untruncated wide induction, or if it increments a wide induction
9145
- /// by its step.
9146
- static bool isOptimizableIVOrUse(VPValue *VPV) {
9147
- VPRecipeBase *Def = VPV->getDefiningRecipe();
9148
- if (!Def)
9149
- return false;
9150
- auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9151
- if (WideIV) {
9152
- // VPV itself is a wide induction, separately compute the end value for exit
9153
- // users if it is not a truncated IV.
9154
- return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9155
- !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9156
- }
9157
-
9158
- // Check if VPV is an optimizable induction increment.
9159
- if (Def->getNumOperands() != 2)
9160
- return false;
9161
- WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9162
- if (!WideIV)
9163
- WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9164
- if (!WideIV)
9165
- return false;
9166
-
9167
- using namespace VPlanPatternMatch;
9168
- auto &ID = WideIV->getInductionDescriptor();
9169
-
9170
- // Check if VPV increments the induction by the induction step.
9171
- VPValue *IVStep = WideIV->getStepValue();
9172
- switch (ID.getInductionOpcode()) {
9173
- case Instruction::Add:
9174
- return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9175
- m_Specific(IVStep)));
9176
- case Instruction::FAdd:
9177
- return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9178
- m_Specific(IVStep)));
9179
- case Instruction::FSub:
9180
- return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9181
- m_Specific(IVStep)));
9182
- case Instruction::Sub: {
9183
- // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9184
- // IVStep.
9185
- VPValue *Step;
9186
- if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9187
- !Step->isLiveIn() || !IVStep->isLiveIn())
9188
- return false;
9189
- auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9190
- auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9191
- return StepCI && IVStepCI &&
9192
- StepCI->getValue() == (-1 * IVStepCI->getValue());
9193
- }
9194
- default:
9195
- return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9196
- match(VPV, m_GetElementPtr(m_Specific(WideIV),
9197
- m_Specific(WideIV->getStepValue())));
9198
- }
9199
- llvm_unreachable("should have been covered by switch above");
9200
- }
9201
-
9202
9030
// Collect VPIRInstructions for phis in the exit blocks that are modeled
9203
9031
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
9204
9032
// modeled explicitly yet and won't be included. Those are un-truncated
@@ -9228,12 +9056,6 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9228
9056
}
9229
9057
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9230
9058
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9231
- // Exit values for inductions are computed and updated outside of VPlan
9232
- // and independent of induction recipes.
9233
- // TODO: Compute induction exit values in VPlan.
9234
- if (isOptimizableIVOrUse(V) &&
9235
- ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9236
- continue;
9237
9059
ExitUsersToFix.insert(ExitIRI);
9238
9060
ExitIRI->addOperand(V);
9239
9061
}
@@ -9253,6 +9075,7 @@ addUsersInExitBlocks(VPlan &Plan,
9253
9075
9254
9076
auto *MiddleVPBB = Plan.getMiddleBlock();
9255
9077
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9078
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9256
9079
9257
9080
// Introduce extract for exiting values and update the VPIRInstructions
9258
9081
// modeling the corresponding LCSSA phis.
@@ -9574,7 +9397,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9574
9397
VPlanTransforms::handleUncountableEarlyExit(
9575
9398
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9576
9399
}
9577
- addScalarResumePhis(RecipeBuilder, *Plan);
9400
+ DenseMap<VPValue *, VPValue *> IVEndValues;
9401
+ addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9578
9402
SetVector<VPIRInstruction *> ExitUsersToFix =
9579
9403
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9580
9404
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
@@ -9657,6 +9481,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9657
9481
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9658
9482
WithoutRuntimeCheck);
9659
9483
}
9484
+ VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
9660
9485
9661
9486
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9662
9487
return Plan;
@@ -9708,7 +9533,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9708
9533
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9709
9534
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9710
9535
}
9711
- addScalarResumePhis(RecipeBuilder, *Plan);
9536
+ DenseMap<VPValue *, VPValue *> IVEndValues;
9537
+ // TODO: IVEndValues are not yet used in the native path to optimize exit
9538
+ // values.
9539
+ addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9712
9540
9713
9541
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9714
9542
return Plan;
0 commit comments