@@ -2970,33 +2970,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2970
2970
LoopScalarPreHeader =
2971
2971
SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
2972
2972
nullptr , Twine (Prefix) + " scalar.ph" );
2973
-
2974
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
2975
-
2976
- // Set up the middle block terminator. Two cases:
2977
- // 1) If we know that we must execute the scalar epilogue, emit an
2978
- // unconditional branch.
2979
- // 2) Otherwise, we must have a single unique exit block (due to how we
2980
- // implement the multiple exit case). In this case, set up a conditional
2981
- // branch from the middle block to the loop scalar preheader, and the
2982
- // exit block. completeLoopSkeleton will update the condition to use an
2983
- // iteration check, if required to decide whether to execute the remainder.
2984
- BranchInst *BrInst =
2985
- Cost->requiresScalarEpilogue (VF.isVector ())
2986
- ? BranchInst::Create (LoopScalarPreHeader)
2987
- : BranchInst::Create (LoopExitBlock, LoopScalarPreHeader,
2988
- Builder.getTrue ());
2989
- BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
2990
- ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
2991
-
2992
- // Update dominator for loop exit. During skeleton creation, only the vector
2993
- // pre-header and the middle block are created. The vector loop is entirely
2994
- // created during VPlan exection.
2995
- if (!Cost->requiresScalarEpilogue (VF.isVector ()))
2996
- // If there is an epilogue which must run, there's no edge from the
2997
- // middle block to exit blocks and thus no need to update the immediate
2998
- // dominator of the exit blocks.
2999
- DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
3000
2973
}
3001
2974
3002
2975
PHINode *InnerLoopVectorizer::createInductionResumeValue (
@@ -3093,51 +3066,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
3093
3066
}
3094
3067
}
3095
3068
3096
- BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () {
3097
- // The trip counts should be cached by now.
3098
- Value *Count = getTripCount ();
3099
- Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3100
-
3101
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3102
-
3103
- // Add a check in the middle block to see if we have completed
3104
- // all of the iterations in the first vector loop. Three cases:
3105
- // 1) If we require a scalar epilogue, there is no conditional branch as
3106
- // we unconditionally branch to the scalar preheader. Do nothing.
3107
- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3108
- // Thus if tail is to be folded, we know we don't need to run the
3109
- // remainder and we can use the previous value for the condition (true).
3110
- // 3) Otherwise, construct a runtime check.
3111
- if (!Cost->requiresScalarEpilogue (VF.isVector ()) &&
3112
- !Cost->foldTailByMasking ()) {
3113
- // Here we use the same DebugLoc as the scalar loop latch terminator instead
3114
- // of the corresponding compare because they may have ended up with
3115
- // different line numbers and we want to avoid awkward line stepping while
3116
- // debugging. Eg. if the compare has got a line number inside the loop.
3117
- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3118
- // operands. Perform simplification directly on VPlan once the branch is
3119
- // modeled there.
3120
- IRBuilder<> B (LoopMiddleBlock->getTerminator ());
3121
- B.SetCurrentDebugLocation (ScalarLatchTerm->getDebugLoc ());
3122
- Value *CmpN = B.CreateICmpEQ (Count, VectorTripCount, " cmp.n" );
3123
- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator ());
3124
- BI.setCondition (CmpN);
3125
- if (hasBranchWeightMD (*ScalarLatchTerm)) {
3126
- // Assume that `Count % VectorTripCount` is equally distributed.
3127
- unsigned TripCount = UF * VF.getKnownMinValue ();
3128
- assert (TripCount > 0 && " trip count should not be zero" );
3129
- const uint32_t Weights[] = {1 , TripCount - 1 };
3130
- setBranchWeights (BI, Weights, /* IsExpected=*/ false );
3131
- }
3132
- }
3133
-
3134
- #ifdef EXPENSIVE_CHECKS
3135
- assert (DT->verify (DominatorTree::VerificationLevel::Fast));
3136
- #endif
3137
-
3138
- return LoopVectorPreHeader;
3139
- }
3140
-
3141
3069
std::pair<BasicBlock *, Value *>
3142
3070
InnerLoopVectorizer::createVectorizedLoopSkeleton (
3143
3071
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3160,7 +3088,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
3160
3088
| [ ]_| <-- vector loop (created during VPlan execution).
3161
3089
| |
3162
3090
| v
3163
- \ -[ ] <--- middle-block.
3091
+ \ -[ ] <--- middle-block (branch to successors created during VPlan
3092
+ | | execution)
3164
3093
\/ |
3165
3094
/\ v
3166
3095
| ->[ ] <--- new preheader.
@@ -3197,7 +3126,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
3197
3126
// Emit phis for the new starting index of the scalar loop.
3198
3127
createInductionResumeValues (ExpandedSCEVs);
3199
3128
3200
- return {completeLoopSkeleton () , nullptr };
3129
+ return {LoopVectorPreHeader , nullptr };
3201
3130
}
3202
3131
3203
3132
// Fix up external users of the induction variable. At this point, we are
@@ -3469,6 +3398,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3469
3398
VF.getKnownMinValue () * UF);
3470
3399
}
3471
3400
3401
+ // Helper to reorder blocks so they match the original order even after the
3402
+ // order of the predecessors changes. This is only used to avoid a number of
3403
+ // test changes due to reordering of incoming blocks in phi nodes and should be
3404
+ // removed soon, with the tests being updated.
3405
+ static void reorderIncomingBlocks (SmallVectorImpl<BasicBlock *> &Blocks,
3406
+ BasicBlock *LoopMiddleBlock) {
3407
+ if (Blocks.front () == LoopMiddleBlock)
3408
+ std::swap (Blocks.front (), Blocks.back ());
3409
+ if (Blocks.size () == 3 )
3410
+ std::swap (Blocks[0 ], Blocks[1 ]);
3411
+ }
3412
+
3472
3413
void InnerLoopVectorizer::fixFixedOrderRecurrence (VPLiveOut *LO,
3473
3414
VPTransformState &State) {
3474
3415
// Extract the last vector element in the middle block. This will be the
@@ -3487,7 +3428,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
3487
3428
Builder.SetInsertPoint (LoopScalarPreHeader, LoopScalarPreHeader->begin ());
3488
3429
auto *ScalarPreheaderPhi =
3489
3430
Builder.CreatePHI (ScalarHeaderPhi->getType (), 2 , " scalar.recur.init" );
3490
- for (auto *BB : predecessors (LoopScalarPreHeader)) {
3431
+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
3432
+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
3433
+ for (auto *BB : Blocks) {
3491
3434
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
3492
3435
ScalarPreheaderPhi->addIncoming (Incoming, BB);
3493
3436
}
@@ -7387,7 +7330,9 @@ static void createAndCollectMergePhiForReduction(
7387
7330
// If we are fixing reductions in the epilogue loop then we should already
7388
7331
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
7389
7332
// we carry over the incoming values correctly.
7390
- for (auto *Incoming : predecessors (LoopScalarPreHeader)) {
7333
+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
7334
+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
7335
+ for (auto *Incoming : Blocks) {
7391
7336
if (Incoming == LoopMiddleBlock)
7392
7337
BCBlockPhi->addIncoming (FinalValue, Incoming);
7393
7338
else if (ResumePhi && is_contained (ResumePhi->blocks (), Incoming))
@@ -7458,6 +7403,21 @@ LoopVectorizationPlanner::executePlan(
7458
7403
std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
7459
7404
ILV.createVectorizedLoopSkeleton (ExpandedSCEVs ? *ExpandedSCEVs
7460
7405
: State.ExpandedSCEVs );
7406
+ #ifdef EXPENSIVE_CHECKS
7407
+ assert (DT->verify (DominatorTree::VerificationLevel::Fast));
7408
+ #endif
7409
+
7410
+ VPBasicBlock *MiddleVPBB =
7411
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion ()->getSingleSuccessor ());
7412
+
7413
+ using namespace llvm ::VPlanPatternMatch;
7414
+ if (MiddleVPBB->begin () != MiddleVPBB->end () &&
7415
+ match (&MiddleVPBB->back (), m_BranchOnCond (m_VPValue ()))) {
7416
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[1 ])
7417
+ ->resetBlock (OrigLoop->getLoopPreheader ());
7418
+ } else
7419
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])
7420
+ ->resetBlock (OrigLoop->getLoopPreheader ());
7461
7421
7462
7422
// Only use noalias metadata when using memory checks guaranteeing no overlap
7463
7423
// across all iterations.
@@ -7538,6 +7498,18 @@ LoopVectorizationPlanner::executePlan(
7538
7498
7539
7499
ILV.printDebugTracesAtEnd ();
7540
7500
7501
+ // Adjust branch weight of the branch in the middle block.
7502
+ auto *MiddleTerm =
7503
+ cast<BranchInst>(State.CFG .VPBB2IRBB [ExitVPBB]->getTerminator ());
7504
+ if (MiddleTerm->isConditional () &&
7505
+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7506
+ // Assume that `Count % VectorTripCount` is equally distributed.
7507
+ unsigned TripCount = State.UF * State.VF .getKnownMinValue ();
7508
+ assert (TripCount > 0 && " trip count should not be zero" );
7509
+ const uint32_t Weights[] = {1 , TripCount - 1 };
7510
+ setBranchWeights (*MiddleTerm, Weights, /* IsExpected=*/ false );
7511
+ }
7512
+
7541
7513
return {State.ExpandedSCEVs , ReductionResumeValues};
7542
7514
}
7543
7515
@@ -7594,7 +7566,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7594
7566
// inductions in the epilogue loop are created before executing the plan for
7595
7567
// the epilogue loop.
7596
7568
7597
- return {completeLoopSkeleton () , nullptr };
7569
+ return {LoopVectorPreHeader , nullptr };
7598
7570
}
7599
7571
7600
7572
void EpilogueVectorizerMainLoop::printDebugTracesAtStart () {
@@ -7783,7 +7755,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7783
7755
{VecEpilogueIterationCountCheck,
7784
7756
EPI.VectorTripCount } /* AdditionalBypass */ );
7785
7757
7786
- return {completeLoopSkeleton () , EPResumeVal};
7758
+ return {LoopVectorPreHeader , EPResumeVal};
7787
7759
}
7788
7760
7789
7761
BasicBlock *
@@ -7828,7 +7800,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7828
7800
setBranchWeights (BI, Weights, /* IsExpected=*/ false );
7829
7801
}
7830
7802
ReplaceInstWithInst (Insert->getTerminator (), &BI);
7831
-
7832
7803
LoopBypassBlocks.push_back (Insert);
7833
7804
return Insert;
7834
7805
}
@@ -8533,9 +8504,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8533
8504
// modified; a basic block for the vector pre-header, followed by a region for
8534
8505
// the vector loop, followed by the middle basic block. The skeleton vector
8535
8506
// loop region contains a header and latch basic blocks.
8507
+
8508
+ // Add a check in the middle block to see if we have completed
8509
+ // all of the iterations in the first vector loop. Three cases:
8510
+ // 1) If we require a scalar epilogue, there is no conditional branch as
8511
+ // we unconditionally branch to the scalar preheader. Do nothing.
8512
+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
8513
+ // Thus if tail is to be folded, we know we don't need to run the
8514
+ // remainder and we can use the previous value for the condition (true).
8515
+ // 3) Otherwise, construct a runtime check.
8516
+ bool RequiresScalarEpilogueCheck =
8517
+ LoopVectorizationPlanner::getDecisionAndClampRange (
8518
+ [this ](ElementCount VF) {
8519
+ return !CM.requiresScalarEpilogue (VF.isVector ());
8520
+ },
8521
+ Range);
8536
8522
VPlanPtr Plan = VPlan::createInitialVPlan (
8537
8523
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8538
- *PSE.getSE (), OrigLoop->getLoopPreheader ());
8524
+ *PSE.getSE (), RequiresScalarEpilogueCheck, CM.foldTailByMasking (),
8525
+ OrigLoop);
8539
8526
VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
8540
8527
VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
8541
8528
VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
@@ -8783,7 +8770,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8783
8770
// Create new empty VPlan
8784
8771
auto Plan = VPlan::createInitialVPlan (
8785
8772
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8786
- *PSE.getSE (), OrigLoop-> getLoopPreheader () );
8773
+ *PSE.getSE (), true , false , OrigLoop);
8787
8774
8788
8775
// Build hierarchical CFG
8789
8776
VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -8992,6 +8979,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
8992
8979
}
8993
8980
}
8994
8981
Builder.setInsertPoint (&*LatchVPBB->begin ());
8982
+ VPBasicBlock *MiddleVPBB =
8983
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor ());
8984
+ VPBasicBlock::iterator IP = MiddleVPBB->begin ();
8995
8985
for (VPRecipeBase &R :
8996
8986
Plan->getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
8997
8987
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9100,8 +9090,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9100
9090
// also modeled in VPlan.
9101
9091
auto *FinalReductionResult = new VPInstruction (
9102
9092
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9103
- cast<VPBasicBlock>(VectorLoopRegion-> getSingleSuccessor ())
9104
- -> appendRecipe (FinalReductionResult );
9093
+ FinalReductionResult-> insertBefore (*MiddleVPBB, IP);
9094
+ IP = std::next (FinalReductionResult-> getIterator () );
9105
9095
OrigExitingVPV->replaceUsesWithIf (
9106
9096
FinalReductionResult,
9107
9097
[](VPUser &User, unsigned ) { return isa<VPLiveOut>(&User); });
@@ -10146,6 +10136,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10146
10136
cast<VPHeaderPHIRecipe>(&R)->setStartValue (StartVal);
10147
10137
}
10148
10138
10139
+ assert (DT->verify (DominatorTree::VerificationLevel::Fast) &&
10140
+ " DT not preserved correctly" );
10149
10141
LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
10150
10142
DT, true , &ExpandedSCEVs);
10151
10143
++LoopsEpilogueVectorized;
0 commit comments