Skip to content

Commit 7d466d7

Browse files
committed
[VPlan] Model branch cond to enter scalar epilogue in VPlan.
#92651
1 parent 52d29eb commit 7d466d7

21 files changed

+567
-138
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 75 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -2970,33 +2970,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29702970
LoopScalarPreHeader =
29712971
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
29722972
nullptr, Twine(Prefix) + "scalar.ph");
2973-
2974-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2975-
2976-
// Set up the middle block terminator. Two cases:
2977-
// 1) If we know that we must execute the scalar epilogue, emit an
2978-
// unconditional branch.
2979-
// 2) Otherwise, we must have a single unique exit block (due to how we
2980-
// implement the multiple exit case). In this case, set up a conditional
2981-
// branch from the middle block to the loop scalar preheader, and the
2982-
// exit block. completeLoopSkeleton will update the condition to use an
2983-
// iteration check, if required to decide whether to execute the remainder.
2984-
BranchInst *BrInst =
2985-
Cost->requiresScalarEpilogue(VF.isVector())
2986-
? BranchInst::Create(LoopScalarPreHeader)
2987-
: BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
2988-
Builder.getTrue());
2989-
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2990-
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
2991-
2992-
// Update dominator for loop exit. During skeleton creation, only the vector
2993-
// pre-header and the middle block are created. The vector loop is entirely
2994-
// created during VPlan execution.
2995-
if (!Cost->requiresScalarEpilogue(VF.isVector()))
2996-
// If there is an epilogue which must run, there's no edge from the
2997-
// middle block to exit blocks and thus no need to update the immediate
2998-
// dominator of the exit blocks.
2999-
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
30002973
}
30012974

30022975
PHINode *InnerLoopVectorizer::createInductionResumeValue(
@@ -3093,51 +3066,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
30933066
}
30943067
}
30953068

3096-
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3097-
// The trip counts should be cached by now.
3098-
Value *Count = getTripCount();
3099-
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3100-
3101-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3102-
3103-
// Add a check in the middle block to see if we have completed
3104-
// all of the iterations in the first vector loop. Three cases:
3105-
// 1) If we require a scalar epilogue, there is no conditional branch as
3106-
// we unconditionally branch to the scalar preheader. Do nothing.
3107-
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3108-
// Thus if tail is to be folded, we know we don't need to run the
3109-
// remainder and we can use the previous value for the condition (true).
3110-
// 3) Otherwise, construct a runtime check.
3111-
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3112-
!Cost->foldTailByMasking()) {
3113-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
3114-
// of the corresponding compare because they may have ended up with
3115-
// different line numbers and we want to avoid awkward line stepping while
3116-
// debugging. E.g. if the compare has got a line number inside the loop.
3117-
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3118-
// operands. Perform simplification directly on VPlan once the branch is
3119-
// modeled there.
3120-
IRBuilder<> B(LoopMiddleBlock->getTerminator());
3121-
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3122-
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3123-
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3124-
BI.setCondition(CmpN);
3125-
if (hasBranchWeightMD(*ScalarLatchTerm)) {
3126-
// Assume that `Count % VectorTripCount` is equally distributed.
3127-
unsigned TripCount = UF * VF.getKnownMinValue();
3128-
assert(TripCount > 0 && "trip count should not be zero");
3129-
const uint32_t Weights[] = {1, TripCount - 1};
3130-
setBranchWeights(BI, Weights, /*IsExpected=*/false);
3131-
}
3132-
}
3133-
3134-
#ifdef EXPENSIVE_CHECKS
3135-
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3136-
#endif
3137-
3138-
return LoopVectorPreHeader;
3139-
}
3140-
31413069
std::pair<BasicBlock *, Value *>
31423070
InnerLoopVectorizer::createVectorizedLoopSkeleton(
31433071
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3160,7 +3088,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31603088
| [ ]_| <-- vector loop (created during VPlan execution).
31613089
| |
31623090
| v
3163-
\ -[ ] <--- middle-block.
3091+
\ -[ ] <--- middle-block (branch to successors created during VPlan
3092+
| | execution)
31643093
\/ |
31653094
/\ v
31663095
| ->[ ] <--- new preheader.
@@ -3197,7 +3126,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31973126
// Emit phis for the new starting index of the scalar loop.
31983127
createInductionResumeValues(ExpandedSCEVs);
31993128

3200-
return {completeLoopSkeleton(), nullptr};
3129+
return {LoopVectorPreHeader, nullptr};
32013130
}
32023131

32033132
// Fix up external users of the induction variable. At this point, we are
@@ -3469,6 +3398,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
34693398
VF.getKnownMinValue() * UF);
34703399
}
34713400

3401+
// Helper to reorder blocks so they match the original order even after the
3402+
// order of the predecessors changes. This is only used to avoid a number of
3403+
// test changes due to reordering of incoming blocks in phi nodes and should be
3404+
// removed soon, with the tests being updated.
3405+
static void reorderIncomingBlocks(SmallVectorImpl<BasicBlock *> &Blocks,
3406+
BasicBlock *LoopMiddleBlock) {
3407+
// Reorders Blocks in place; nothing is returned.
// NOTE(review): front()/back() are used unconditionally, so this assumes
// Blocks is non-empty. The callers visible here pass the predecessors of the
// scalar preheader -- confirm that list can never be empty.
// Step 1: if the middle block is listed first, exchange the first and last
// entries so the middle block ends up in the last slot.
if (Blocks.front() == LoopMiddleBlock)
3408+
std::swap(Blocks.front(), Blocks.back());
3409+
// Step 2: with exactly three blocks, additionally exchange the first two
// entries. This runs whether or not the swap above happened.
// NOTE(review): presumably this reproduces the predecessor order existing
// tests were written against -- verify before changing.
if (Blocks.size() == 3)
3410+
std::swap(Blocks[0], Blocks[1]);
3411+
}
3412+
34723413
void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34733414
VPTransformState &State) {
34743415
// Extract the last vector element in the middle block. This will be the
@@ -3487,7 +3428,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34873428
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
34883429
auto *ScalarPreheaderPhi =
34893430
Builder.CreatePHI(ScalarHeaderPhi->getType(), 2, "scalar.recur.init");
3490-
for (auto *BB : predecessors(LoopScalarPreHeader)) {
3431+
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
3432+
reorderIncomingBlocks(Blocks, LoopMiddleBlock);
3433+
for (auto *BB : Blocks) {
34913434
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
34923435
ScalarPreheaderPhi->addIncoming(Incoming, BB);
34933436
}
@@ -7387,7 +7330,9 @@ static void createAndCollectMergePhiForReduction(
73877330
// If we are fixing reductions in the epilogue loop then we should already
73887331
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
73897332
// we carry over the incoming values correctly.
7390-
for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7333+
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
7334+
reorderIncomingBlocks(Blocks, LoopMiddleBlock);
7335+
for (auto *Incoming : Blocks) {
73917336
if (Incoming == LoopMiddleBlock)
73927337
BCBlockPhi->addIncoming(FinalValue, Incoming);
73937338
else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
@@ -7458,6 +7403,21 @@ LoopVectorizationPlanner::executePlan(
74587403
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
74597404
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
74607405
: State.ExpandedSCEVs);
7406+
#ifdef EXPENSIVE_CHECKS
7407+
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7408+
#endif
7409+
7410+
VPBasicBlock *MiddleVPBB =
7411+
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7412+
7413+
using namespace llvm::VPlanPatternMatch;
7414+
if (MiddleVPBB->begin() != MiddleVPBB->end() &&
7415+
match(&MiddleVPBB->back(), m_BranchOnCond(m_VPValue()))) {
7416+
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[1])
7417+
->resetBlock(OrigLoop->getLoopPreheader());
7418+
} else
7419+
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])
7420+
->resetBlock(OrigLoop->getLoopPreheader());
74617421

74627422
// Only use noalias metadata when using memory checks guaranteeing no overlap
74637423
// across all iterations.
@@ -7538,6 +7498,18 @@ LoopVectorizationPlanner::executePlan(
75387498

75397499
ILV.printDebugTracesAtEnd();
75407500

7501+
// Adjust branch weight of the branch in the middle block.
7502+
auto *MiddleTerm =
7503+
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7504+
if (MiddleTerm->isConditional() &&
7505+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7506+
// Assume that `Count % VectorTripCount` is equally distributed.
7507+
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7508+
assert(TripCount > 0 && "trip count should not be zero");
7509+
const uint32_t Weights[] = {1, TripCount - 1};
7510+
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7511+
}
7512+
75417513
return {State.ExpandedSCEVs, ReductionResumeValues};
75427514
}
75437515

@@ -7594,7 +7566,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
75947566
// inductions in the epilogue loop are created before executing the plan for
75957567
// the epilogue loop.
75967568

7597-
return {completeLoopSkeleton(), nullptr};
7569+
return {LoopVectorPreHeader, nullptr};
75987570
}
75997571

76007572
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7783,7 +7755,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77837755
{VecEpilogueIterationCountCheck,
77847756
EPI.VectorTripCount} /* AdditionalBypass */);
77857757

7786-
return {completeLoopSkeleton(), EPResumeVal};
7758+
return {LoopVectorPreHeader, EPResumeVal};
77877759
}
77887760

77897761
BasicBlock *
@@ -7828,7 +7800,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
78287800
setBranchWeights(BI, Weights, /*IsExpected=*/false);
78297801
}
78307802
ReplaceInstWithInst(Insert->getTerminator(), &BI);
7831-
78327803
LoopBypassBlocks.push_back(Insert);
78337804
return Insert;
78347805
}
@@ -8533,9 +8504,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85338504
// modified; a basic block for the vector pre-header, followed by a region for
85348505
// the vector loop, followed by the middle basic block. The skeleton vector
85358506
// loop region contains a header and latch basic blocks.
8507+
8508+
// Add a check in the middle block to see if we have completed
8509+
// all of the iterations in the first vector loop. Three cases:
8510+
// 1) If we require a scalar epilogue, there is no conditional branch as
8511+
// we unconditionally branch to the scalar preheader. Do nothing.
8512+
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
8513+
// Thus if tail is to be folded, we know we don't need to run the
8514+
// remainder and we can use the previous value for the condition (true).
8515+
// 3) Otherwise, construct a runtime check.
8516+
bool RequiresScalarEpilogueCheck =
8517+
LoopVectorizationPlanner::getDecisionAndClampRange(
8518+
[this](ElementCount VF) {
8519+
return !CM.requiresScalarEpilogue(VF.isVector());
8520+
},
8521+
Range);
85368522
VPlanPtr Plan = VPlan::createInitialVPlan(
85378523
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8538-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8524+
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8525+
OrigLoop);
85398526
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
85408527
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
85418528
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
@@ -8783,7 +8770,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
87838770
// Create new empty VPlan
87848771
auto Plan = VPlan::createInitialVPlan(
87858772
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8786-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8773+
*PSE.getSE(), true, false, OrigLoop);
87878774

87888775
// Build hierarchical CFG
87898776
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -8992,6 +8979,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
89928979
}
89938980
}
89948981
Builder.setInsertPoint(&*LatchVPBB->begin());
8982+
VPBasicBlock *MiddleVPBB =
8983+
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
8984+
VPBasicBlock::iterator IP = MiddleVPBB->begin();
89958985
for (VPRecipeBase &R :
89968986
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
89978987
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9100,8 +9090,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91009090
// also modeled in VPlan.
91019091
auto *FinalReductionResult = new VPInstruction(
91029092
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9103-
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9104-
->appendRecipe(FinalReductionResult);
9093+
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9094+
IP = std::next(FinalReductionResult->getIterator());
91059095
OrigExitingVPV->replaceUsesWithIf(
91069096
FinalReductionResult,
91079097
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
@@ -10146,6 +10136,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1014610136
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1014710137
}
1014810138

10139+
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10140+
"DT not preserved correctly");
1014910141
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
1015010142
DT, true, &ExpandedSCEVs);
1015110143
++LoopsEpilogueVectorized;

0 commit comments

Comments
 (0)