Skip to content

Commit 99d6c6d

Browse files
authored
[VPlan] Model branch cond to enter scalar epilogue in VPlan. (#92651)
This patch moves the creation of the branch condition for entering the scalar epilogue loop to VPlan. Modeling the branch in the middle block also requires modeling the successor blocks. This is done using the recently introduced VPIRBasicBlock. Note that the middle.block is still created as part of the skeleton and then patched in during VPlan execution. Unfortunately, the skeleton needs to create the middle.block early on, as it is also used for induction resume value creation and is also needed to properly update the dominator tree during skeleton creation. After this patch lands, I plan to move induction resume value and phi node creation in the scalar preheader to VPlan. Once that is done, we should be able to create the middle.block in VPlan directly. This is a re-worked version based on the earlier https://reviews.llvm.org/D150398; the main change is the use of VPIRBasicBlock. Depends on #92525. PR: #92651
1 parent b546096 commit 99d6c6d

File tree

103 files changed

+1621
-1226
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+1621
-1226
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 34 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -2964,34 +2964,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29642964
LoopScalarPreHeader =
29652965
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
29662966
nullptr, Twine(Prefix) + "scalar.ph");
2967-
2968-
// Set up the middle block terminator. Two cases:
2969-
// 1) If we know that we must execute the scalar epilogue, retain the existing
2970-
// unconditional branch from the middle block to the scalar preheader. In that
2971-
// case, there's no edge from the middle block to exit blocks and thus no
2972-
// need to update the immediate dominator of the exit blocks.
2973-
if (Cost->requiresScalarEpilogue(VF.isVector())) {
2974-
assert(
2975-
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader &&
2976-
" middle block should have the scalar preheader as single successor");
2977-
return;
2978-
}
2979-
2980-
// 2) Otherwise, we must have a single unique exit block (due to how we
2981-
// implement the multiple exit case). In this case, set up a conditional
2982-
// branch from the middle block to the loop scalar preheader, and the
2983-
// exit block. completeLoopSkeleton will update the condition to use an
2984-
// iteration check, if required to decide whether to execute the remainder.
2985-
BranchInst *BrInst =
2986-
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
2987-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2988-
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2989-
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
2990-
2991-
// Update dominator for loop exit. During skeleton creation, only the vector
2992-
// pre-header and the middle block are created. The vector loop is entirely
2993-
// created during VPlan execution.
2994-
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
29952967
}
29962968

29972969
PHINode *InnerLoopVectorizer::createInductionResumeValue(
@@ -3088,51 +3060,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
30883060
}
30893061
}
30903062

3091-
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3092-
// The trip counts should be cached by now.
3093-
Value *Count = getTripCount();
3094-
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3095-
3096-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3097-
3098-
// Add a check in the middle block to see if we have completed
3099-
// all of the iterations in the first vector loop. Three cases:
3100-
// 1) If we require a scalar epilogue, there is no conditional branch as
3101-
// we unconditionally branch to the scalar preheader. Do nothing.
3102-
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3103-
// Thus if tail is to be folded, we know we don't need to run the
3104-
// remainder and we can use the previous value for the condition (true).
3105-
// 3) Otherwise, construct a runtime check.
3106-
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3107-
!Cost->foldTailByMasking()) {
3108-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
3109-
// of the corresponding compare because they may have ended up with
3110-
// different line numbers and we want to avoid awkward line stepping while
3111-
// debugging. Eg. if the compare has got a line number inside the loop.
3112-
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3113-
// operands. Perform simplification directly on VPlan once the branch is
3114-
// modeled there.
3115-
IRBuilder<> B(LoopMiddleBlock->getTerminator());
3116-
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3117-
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3118-
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3119-
BI.setCondition(CmpN);
3120-
if (hasBranchWeightMD(*ScalarLatchTerm)) {
3121-
// Assume that `Count % VectorTripCount` is equally distributed.
3122-
unsigned TripCount = UF * VF.getKnownMinValue();
3123-
assert(TripCount > 0 && "trip count should not be zero");
3124-
const uint32_t Weights[] = {1, TripCount - 1};
3125-
setBranchWeights(BI, Weights, /*IsExpected=*/false);
3126-
}
3127-
}
3128-
3129-
#ifdef EXPENSIVE_CHECKS
3130-
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3131-
#endif
3132-
3133-
return LoopVectorPreHeader;
3134-
}
3135-
31363063
std::pair<BasicBlock *, Value *>
31373064
InnerLoopVectorizer::createVectorizedLoopSkeleton(
31383065
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3155,17 +3082,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31553082
| [ ]_| <-- vector loop (created during VPlan execution).
31563083
| |
31573084
| v
3158-
\ -[ ] <--- middle-block.
3085+
\ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
3086+
| | successors created during VPlan execution)
31593087
\/ |
31603088
/\ v
3161-
| ->[ ] <--- new preheader.
3089+
| ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
31623090
| |
31633091
(opt) v <-- edge from middle to exit iff epilogue is not required.
31643092
| [ ] \
31653093
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
31663094
\ |
31673095
\ v
3168-
>[ ] <-- exit block(s).
3096+
>[ ] <-- exit block(s) (wrapped in VPIRBasicBlock).
31693097
...
31703098
*/
31713099

@@ -3192,7 +3120,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31923120
// Emit phis for the new starting index of the scalar loop.
31933121
createInductionResumeValues(ExpandedSCEVs);
31943122

3195-
return {completeLoopSkeleton(), nullptr};
3123+
return {LoopVectorPreHeader, nullptr};
31963124
}
31973125

31983126
// Fix up external users of the induction variable. At this point, we are
@@ -7477,6 +7405,9 @@ LoopVectorizationPlanner::executePlan(
74777405
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
74787406
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
74797407
: State.ExpandedSCEVs);
7408+
#ifdef EXPENSIVE_CHECKS
7409+
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7410+
#endif
74807411

74817412
// Only use noalias metadata when using memory checks guaranteeing no overlap
74827413
// across all iterations.
@@ -7557,6 +7488,18 @@ LoopVectorizationPlanner::executePlan(
75577488

75587489
ILV.printDebugTracesAtEnd();
75597490

7491+
// 4. Adjust branch weight of the branch in the middle block.
7492+
auto *MiddleTerm =
7493+
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7494+
if (MiddleTerm->isConditional() &&
7495+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7496+
// Assume that `Count % VectorTripCount` is equally distributed.
7497+
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7498+
assert(TripCount > 0 && "trip count should not be zero");
7499+
const uint32_t Weights[] = {1, TripCount - 1};
7500+
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7501+
}
7502+
75607503
return {State.ExpandedSCEVs, ReductionResumeValues};
75617504
}
75627505

@@ -7613,7 +7556,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
76137556
// inductions in the epilogue loop are created before executing the plan for
76147557
// the epilogue loop.
76157558

7616-
return {completeLoopSkeleton(), nullptr};
7559+
return {LoopVectorPreHeader, nullptr};
76177560
}
76187561

76197562
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7802,7 +7745,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
78027745
{VecEpilogueIterationCountCheck,
78037746
EPI.VectorTripCount} /* AdditionalBypass */);
78047747

7805-
return {completeLoopSkeleton(), EPResumeVal};
7748+
return {LoopVectorPreHeader, EPResumeVal};
78067749
}
78077750

78087751
BasicBlock *
@@ -7847,7 +7790,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
78477790
setBranchWeights(BI, Weights, /*IsExpected=*/false);
78487791
}
78497792
ReplaceInstWithInst(Insert->getTerminator(), &BI);
7850-
78517793
LoopBypassBlocks.push_back(Insert);
78527794
return Insert;
78537795
}
@@ -8552,9 +8494,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85528494
// modified; a basic block for the vector pre-header, followed by a region for
85538495
// the vector loop, followed by the middle basic block. The skeleton vector
85548496
// loop region contains a header and latch basic blocks.
8497+
8498+
bool RequiresScalarEpilogueCheck =
8499+
LoopVectorizationPlanner::getDecisionAndClampRange(
8500+
[this](ElementCount VF) {
8501+
return !CM.requiresScalarEpilogue(VF.isVector());
8502+
},
8503+
Range);
85558504
VPlanPtr Plan = VPlan::createInitialVPlan(
85568505
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8557-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8506+
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8507+
OrigLoop);
85588508
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
85598509
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
85608510
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
@@ -8802,7 +8752,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
88028752
// Create new empty VPlan
88038753
auto Plan = VPlan::createInitialVPlan(
88048754
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8805-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8755+
*PSE.getSE(), true, false, OrigLoop);
88068756

88078757
// Build hierarchical CFG
88088758
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -10163,6 +10113,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016310113
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1016410114
}
1016510115

10116+
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10117+
"DT not preserved correctly");
1016610118
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
1016710119
DT, true, &ExpandedSCEVs);
1016810120
++LoopsEpilogueVectorized;

0 commit comments

Comments
 (0)