Skip to content

Commit e1c1bd1

Browse files
committed
[VPlan] Model branch cond to enter scalar epilogue in VPlan.
#92651
1 parent 36bc741 commit e1c1bd1

21 files changed

+561
-127
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 77 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -2972,22 +2972,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29722972
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
29732973
nullptr, Twine(Prefix) + "scalar.ph");
29742974

2975-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2976-
2977-
// Set up the middle block terminator. Two cases:
2978-
// 1) If we know that we must execute the scalar epilogue, emit an
2979-
// unconditional branch.
2980-
// 2) Otherwise, we must have a single unique exit block (due to how we
2981-
// implement the multiple exit case). In this case, set up a conditional
2982-
// branch from the middle block to the loop scalar preheader, and the
2983-
// exit block. completeLoopSkeleton will update the condition to use an
2984-
// iteration check, if required to decide whether to execute the remainder.
2985-
BranchInst *BrInst =
2986-
Cost->requiresScalarEpilogue(VF.isVector())
2987-
? BranchInst::Create(LoopScalarPreHeader)
2988-
: BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
2989-
Builder.getTrue());
2990-
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2975+
auto *BrInst = new UnreachableInst(LoopMiddleBlock->getContext());
29912976
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
29922977

29932978
// Update dominator for loop exit. During skeleton creation, only the vector
@@ -3094,51 +3079,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
30943079
}
30953080
}
30963081

3097-
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3098-
// The trip counts should be cached by now.
3099-
Value *Count = getTripCount();
3100-
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3101-
3102-
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3103-
3104-
// Add a check in the middle block to see if we have completed
3105-
// all of the iterations in the first vector loop. Three cases:
3106-
// 1) If we require a scalar epilogue, there is no conditional branch as
3107-
// we unconditionally branch to the scalar preheader. Do nothing.
3108-
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3109-
// Thus if tail is to be folded, we know we don't need to run the
3110-
// remainder and we can use the previous value for the condition (true).
3111-
// 3) Otherwise, construct a runtime check.
3112-
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3113-
!Cost->foldTailByMasking()) {
3114-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
3115-
// of the corresponding compare because they may have ended up with
3116-
// different line numbers and we want to avoid awkward line stepping while
3117-
// debugging. E.g., if the compare has got a line number inside the loop.
3118-
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3119-
// operands. Perform simplification directly on VPlan once the branch is
3120-
// modeled there.
3121-
IRBuilder<> B(LoopMiddleBlock->getTerminator());
3122-
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3123-
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3124-
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3125-
BI.setCondition(CmpN);
3126-
if (hasBranchWeightMD(*ScalarLatchTerm)) {
3127-
// Assume that `Count % VectorTripCount` is equally distributed.
3128-
unsigned TripCount = UF * VF.getKnownMinValue();
3129-
assert(TripCount > 0 && "trip count should not be zero");
3130-
const uint32_t Weights[] = {1, TripCount - 1};
3131-
setBranchWeights(BI, Weights);
3132-
}
3133-
}
3134-
3135-
#ifdef EXPENSIVE_CHECKS
3136-
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3137-
#endif
3138-
3139-
return LoopVectorPreHeader;
3140-
}
3141-
31423082
std::pair<BasicBlock *, Value *>
31433083
InnerLoopVectorizer::createVectorizedLoopSkeleton(
31443084
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3198,7 +3138,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31983138
// Emit phis for the new starting index of the scalar loop.
31993139
createInductionResumeValues(ExpandedSCEVs);
32003140

3201-
return {completeLoopSkeleton(), nullptr};
3141+
return {LoopVectorPreHeader, nullptr};
32023142
}
32033143

32043144
// Fix up external users of the induction variable. At this point, we are
@@ -3470,6 +3410,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
34703410
VF.getKnownMinValue() * UF);
34713411
}
34723412

3413+
// Helper to reorder blocks so they match the original order even after the
3414+
// order of the predecessors changes. This is only used to avoid a number of
3415+
// test changes due to reordering of incoming blocks in phi nodes and should be
3416+
// removed soon, with the tests being updated.
//
// \p Blocks is permuted in place and nothing is returned. Two independent
// adjustments are applied, in this order:
//   1) if the first entry is \p LoopMiddleBlock, the first and last entries
//      are swapped;
//   2) if there are exactly three entries, the first two are swapped.
// NOTE(review): front()/back() are read without an emptiness check, so this
// assumes \p Blocks is non-empty -- confirm that holds at every call site.
3417+
static void reorderIncomingBlocks(SmallVectorImpl<BasicBlock *> &Blocks,
3418+
BasicBlock *LoopMiddleBlock) {
3419+
// Move the middle block from the front of the list to the back.
if (Blocks.front() == LoopMiddleBlock)
3420+
std::swap(Blocks.front(), Blocks.back());
3421+
// With exactly three incoming blocks, additionally exchange the first two.
// This check runs regardless of whether the swap above happened.
if (Blocks.size() == 3)
3422+
std::swap(Blocks[0], Blocks[1]);
3423+
}
3424+
34733425
void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34743426
VPTransformState &State) {
34753427
// Extract the last vector element in the middle block. This will be the
@@ -3488,7 +3440,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34883440
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
34893441
auto *ScalarPreheaderPhi =
34903442
Builder.CreatePHI(ScalarHeaderPhi->getType(), 2, "scalar.recur.init");
3491-
for (auto *BB : predecessors(LoopScalarPreHeader)) {
3443+
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
3444+
reorderIncomingBlocks(Blocks, LoopMiddleBlock);
3445+
for (auto *BB : Blocks) {
34923446
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
34933447
ScalarPreheaderPhi->addIncoming(Incoming, BB);
34943448
}
@@ -7388,7 +7342,9 @@ static void createAndCollectMergePhiForReduction(
73887342
// If we are fixing reductions in the epilogue loop then we should already
73897343
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
73907344
// we carry over the incoming values correctly.
7391-
for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7345+
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
7346+
reorderIncomingBlocks(Blocks, LoopMiddleBlock);
7347+
for (auto *Incoming : Blocks) {
73927348
if (Incoming == LoopMiddleBlock)
73937349
BCBlockPhi->addIncoming(FinalValue, Incoming);
73947350
else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
@@ -7459,6 +7415,21 @@ LoopVectorizationPlanner::executePlan(
74597415
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
74607416
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
74617417
: State.ExpandedSCEVs);
7418+
#ifdef EXPENSIVE_CHECKS
7419+
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7420+
#endif
7421+
7422+
VPBasicBlock *MiddleVPBB =
7423+
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7424+
7425+
using namespace llvm::VPlanPatternMatch;
7426+
if (MiddleVPBB->begin() != MiddleVPBB->end() &&
7427+
match(&MiddleVPBB->back(), m_BranchOnCond(m_VPValue()))) {
7428+
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[1])
7429+
->resetBlock(OrigLoop->getLoopPreheader());
7430+
} else
7431+
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0])
7432+
->resetBlock(OrigLoop->getLoopPreheader());
74627433

74637434
// Only use noalias metadata when using memory checks guaranteeing no overlap
74647435
// across all iterations.
@@ -7539,6 +7510,18 @@ LoopVectorizationPlanner::executePlan(
75397510

75407511
ILV.printDebugTracesAtEnd();
75417512

7513+
// Adjust branch weight of the branch in the middle block.
7514+
auto *MiddleTerm =
7515+
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7516+
if (MiddleTerm->isConditional() &&
7517+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7518+
// Assume that `Count % VectorTripCount` is equally distributed.
7519+
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7520+
assert(TripCount > 0 && "trip count should not be zero");
7521+
const uint32_t Weights[] = {1, TripCount - 1};
7522+
setBranchWeights(*MiddleTerm, Weights);
7523+
}
7524+
75427525
return {State.ExpandedSCEVs, ReductionResumeValues};
75437526
}
75447527

@@ -7595,7 +7578,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
75957578
// inductions in the epilogue loop are created before executing the plan for
75967579
// the epilogue loop.
75977580

7598-
return {completeLoopSkeleton(), nullptr};
7581+
return {LoopVectorPreHeader, nullptr};
75997582
}
76007583

76017584
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7719,8 +7702,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77197702
VecEpilogueIterationCountCheck,
77207703
VecEpilogueIterationCountCheck->getSinglePredecessor());
77217704

7722-
DT->changeImmediateDominator(LoopScalarPreHeader,
7723-
EPI.EpilogueIterationCountCheck);
7705+
if (auto *N = DT->getNode(LoopScalarPreHeader))
7706+
DT->changeImmediateDominator(LoopScalarPreHeader,
7707+
EPI.EpilogueIterationCountCheck);
7708+
else
7709+
DT->addNewBlock(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck);
77247710
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
77257711
// If there is an epilogue which must run, there's no edge from the
77267712
// middle block to exit blocks and thus no need to update the immediate
@@ -7784,7 +7770,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77847770
{VecEpilogueIterationCountCheck,
77857771
EPI.VectorTripCount} /* AdditionalBypass */);
77867772

7787-
return {completeLoopSkeleton(), EPResumeVal};
7773+
return {LoopVectorPreHeader, EPResumeVal};
77887774
}
77897775

77907776
BasicBlock *
@@ -8534,9 +8520,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85348520
// modified; a basic block for the vector pre-header, followed by a region for
85358521
// the vector loop, followed by the middle basic block. The skeleton vector
85368522
// loop region contains a header and latch basic blocks.
8523+
8524+
// Add a check in the middle block to see if we have completed
8525+
// all of the iterations in the first vector loop. Three cases:
8526+
// 1) If we require a scalar epilogue, there is no conditional branch as
8527+
// we unconditionally branch to the scalar preheader. Do nothing.
8528+
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
8529+
// Thus if tail is to be folded, we know we don't need to run the
8530+
// remainder and we can use the previous value for the condition (true).
8531+
// 3) Otherwise, construct a runtime check.
8532+
bool RequiresScalarEpilogueCheck =
8533+
LoopVectorizationPlanner::getDecisionAndClampRange(
8534+
[this](ElementCount VF) {
8535+
return !CM.requiresScalarEpilogue(VF.isVector());
8536+
},
8537+
Range);
85378538
VPlanPtr Plan = VPlan::createInitialVPlan(
85388539
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8539-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8540+
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
8541+
OrigLoop);
85408542
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
85418543
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
85428544
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
@@ -8784,7 +8786,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
87848786
// Create new empty VPlan
87858787
auto Plan = VPlan::createInitialVPlan(
87868788
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8787-
*PSE.getSE(), OrigLoop->getLoopPreheader());
8789+
*PSE.getSE(), true, false, OrigLoop);
87888790

87898791
// Build hierarchical CFG
87908792
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -8993,6 +8995,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
89938995
}
89948996
}
89958997
Builder.setInsertPoint(&*LatchVPBB->begin());
8998+
VPBasicBlock *MiddleVPBB =
8999+
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
9000+
VPBasicBlock::iterator IP = MiddleVPBB->begin();
89969001
for (VPRecipeBase &R :
89979002
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
89989003
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9101,8 +9106,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91019106
// also modeled in VPlan.
91029107
auto *FinalReductionResult = new VPInstruction(
91039108
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9104-
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9105-
->appendRecipe(FinalReductionResult);
9109+
FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9110+
IP = std::next(FinalReductionResult->getIterator());
91069111
OrigExitingVPV->replaceUsesWithIf(
91079112
FinalReductionResult,
91089113
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });

0 commit comments

Comments
 (0)