Skip to content

Commit 3dfa213

Browse files
committed
[VPlan] Introduce multi-branch recipe, use for multi-exit loops (WIP).
This patch introduces a new BranchMultipleConds VPInstruction that takes multiple conditions and branches to the first successor if the first condition is true, to the second successor if the second condition is true, and to the region header if neither is true. At the moment it only supports 2 conditions, but it can be extended in the future. This may serve as an alternative to changing VPRegionBlock to allow multiple exiting blocks, while keeping it single-entry-single-exit. With BranchMultipleConds, we still leave a region via a single exiting block, but can have more than 2 destinations (similar idea to switch in LLVM IR). The new recipe allows precise modeling of the edges and conditions leaving the vector loop region. BranchMultipleConds also allows predicating instructions in blocks after any early exit, i.e. it also allows later stores. See llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll for an example VPlan and llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll for example predicated codegen. The patch also contains logic to construct VPlans using BranchMultipleConds for simple loops with 2 exit blocks instead of requiring a scalar tail. The logic to detect such cases is a bit rough around the edges and is mainly intended to test the new recipes end-to-end. This may serve as an alternative to #108563 that would allow us to keep the single-entry-single-exit property and support predication between early exits and latches.
1 parent 644899a commit 3dfa213

File tree

9 files changed

+467
-73
lines changed

9 files changed

+467
-73
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,8 @@ class LoopVectorizationLegality {
275275
/// we can use in-order reductions.
276276
bool canVectorizeFPMath(bool EnableStrictReductions);
277277

278+
/// Return true if this loop can be vectorized as a multi-exit loop, i.e.
/// with more than one exit condition leaving the vector loop.
/// NOTE(review): the exact set of accepted loop shapes is defined by the
/// implementation in LoopVectorizationLegality.cpp; confirm there.
bool canVectorizeMultiCond() const;
279+
278280
/// Return true if we can vectorize this loop while folding its tail by
279281
/// masking.
280282
bool canFoldTailByMasking() const;

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
4343
cl::desc("Enable recognition of non-constant strided "
4444
"pointer induction variables."));
4545

46+
// Hidden command-line flag, off by default. It is the first thing
// LoopVectorizationLegality::canVectorizeMultiCond() checks, so multi-exit
// vectorization is disabled unless -enable-multi-cond-vectorization is set.
// NOTE(review): the description string is currently empty.
static cl::opt<bool> EnableMultiCond("enable-multi-cond-vectorization",
47+
cl::init(false), cl::Hidden, cl::desc(""));
48+
4649
namespace llvm {
4750
cl::opt<bool>
4851
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1247,6 +1250,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
12471250
}
12481251

12491252
// Returns true if block \p BB must be predicated when the loop is vectorized.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
1253+
// For loops accepted by canVectorizeMultiCond(), every block except the
// header is reported as needing predication.
// NOTE(review): presumably because those blocks run after the early exit
// in the header and must be masked by it -- confirm.
if (canVectorizeMultiCond() && BB != TheLoop->getHeader())
1254+
return true;
12501255
// Otherwise defer to the generic LoopAccessInfo query.
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
12511256
}
12521257

@@ -1377,6 +1382,37 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
13771382
return true;
13781383
}
13791384

1385+
// Returns true if TheLoop has the restricted two-exit shape accepted for
// multi-condition vectorization. All of the checks below must pass; any
// failure returns false.
bool LoopVectorizationLegality::canVectorizeMultiCond() const {
1386+
// The feature is opt-in via the EnableMultiCond command-line flag.
if (!EnableMultiCond)
1387+
return false;
1388+
// Loops with a unique exit block are rejected here.
// NOTE(review): presumably they are left to the existing single-exit
// handling -- confirm.
if (TheLoop->getUniqueExitBlock())
1389+
return false;
1390+
SmallVector<BasicBlock *> Exiting;
1391+
TheLoop->getExitingBlocks(Exiting);
1392+
// Require exactly two exiting blocks: the header (reported first) and the
// latch (reported second). In addition, no instruction in the header may
// read from memory or have side effects.
// NOTE(review): this relies on the order in which getExitingBlocks()
// reports the blocks -- confirm the order is guaranteed.
if (Exiting.size() != 2 || Exiting[0] != TheLoop->getHeader() ||
1393+
Exiting[1] != TheLoop->getLoopLatch() ||
1394+
any_of(*TheLoop->getHeader(), [](Instruction &I) {
1395+
return I.mayReadFromMemory() || I.mayHaveSideEffects();
1396+
}))
1397+
return false;
1398+
CmpInst::Predicate Pred;
1399+
// A and B capture the compare operands; they are not used after the match.
Value *A, *B;
1400+
// The header must end in a branch whose condition is an integer compare,
// and the compare predicate must not be EQ or NE.
if (!match(
1401+
TheLoop->getHeader()->getTerminator(),
1402+
m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value())) ||
1403+
Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)
1404+
return false;
1405+
// Reject the loop if any instruction in any loop block has a user located
// outside the loop, i.e. only loops without live-out values are accepted.
if (any_of(TheLoop->getBlocks(), [this](BasicBlock *BB) {
1406+
return any_of(*BB, [this](Instruction &I) {
1407+
return any_of(I.users(), [this](User *U) {
1408+
// cast<> asserts that every user is an Instruction.
return !TheLoop->contains(cast<Instruction>(U)->getParent());
1409+
});
1410+
});
1411+
}))
1412+
return false;
1413+
return true;
1414+
}
1415+
13801416
// Helper function to canVectorizeLoopNestCFG.
13811417
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
13821418
bool UseVPlanNativePath) {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 130 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,9 +1387,11 @@ class LoopVectorizationCostModel {
13871387
// If we might exit from anywhere but the latch, must run the exiting
13881388
// iteration in scalar form.
13891389
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1390-
LLVM_DEBUG(
1391-
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1392-
return true;
1390+
if (!Legal->canVectorizeMultiCond()) {
1391+
LLVM_DEBUG(
1392+
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1393+
return true;
1394+
}
13931395
}
13941396
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
13951397
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2571,8 +2573,17 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
25712573
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
25722574
assert(LoopVectorPreHeader && "Invalid loop structure");
25732575
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2574-
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2575-
"multiple exit loop without required epilogue?");
2576+
if (Legal->canVectorizeMultiCond()) {
2577+
BasicBlock *Latch = OrigLoop->getLoopLatch();
2578+
BasicBlock *TrueSucc =
2579+
cast<BranchInst>(Latch->getTerminator())->getSuccessor(0);
2580+
BasicBlock *FalseSucc =
2581+
cast<BranchInst>(Latch->getTerminator())->getSuccessor(1);
2582+
LoopExitBlock = OrigLoop->contains(TrueSucc) ? FalseSucc : TrueSucc;
2583+
} else {
2584+
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2585+
"multiple exit loop without required epilogue?");
2586+
}
25762587

25772588
LoopMiddleBlock =
25782589
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -2943,24 +2954,26 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
29432954
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
29442955
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
29452956
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
2946-
if (Cost->requiresScalarEpilogue(VF.isVector())) {
2947-
// No edge from the middle block to the unique exit block has been inserted
2948-
// and there is nothing to fix from vector loop; phis should have incoming
2949-
// from scalar loop only.
2950-
} else {
2951-
// TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2952-
// the cost model.
2953-
2954-
// If we inserted an edge from the middle block to the unique exit block,
2955-
// update uses outside the loop (phis) to account for the newly inserted
2956-
// edge.
2957-
2958-
// Fix-up external users of the induction variables.
2959-
for (const auto &Entry : Legal->getInductionVars())
2960-
fixupIVUsers(Entry.first, Entry.second,
2961-
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
2962-
IVEndValues[Entry.first], LoopMiddleBlock,
2963-
VectorLoop->getHeader(), Plan, State);
2957+
if (OrigLoop->getUniqueExitBlock()) {
2958+
if (Cost->requiresScalarEpilogue(VF.isVector())) {
2959+
// No edge from the middle block to the unique exit block has been
2960+
// inserted and there is nothing to fix from vector loop; phis should have
2961+
// incoming from scalar loop only.
2962+
} else {
2963+
// TODO: Check VPLiveOuts to see if IV users need fixing instead of
2964+
// checking the cost model.
2965+
2966+
// If we inserted an edge from the middle block to the unique exit block,
2967+
// update uses outside the loop (phis) to account for the newly inserted
2968+
// edge.
2969+
2970+
// Fix-up external users of the induction variables.
2971+
for (const auto &Entry : Legal->getInductionVars())
2972+
fixupIVUsers(Entry.first, Entry.second,
2973+
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
2974+
IVEndValues[Entry.first], LoopMiddleBlock,
2975+
VectorLoop->getHeader(), Plan, State);
2976+
}
29642977
}
29652978

29662979
// Fix live-out phis not already fixed earlier.
@@ -3584,7 +3597,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
35843597
TheLoop->getExitingBlocks(Exiting);
35853598
for (BasicBlock *E : Exiting) {
35863599
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3587-
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3600+
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
3601+
(TheLoop->getLoopLatch() == E || !Legal->canVectorizeMultiCond()))
35883602
AddToWorklistIfAllowed(Cmp);
35893603
}
35903604

@@ -7515,7 +7529,8 @@ LoopVectorizationPlanner::executePlan(
75157529
LLVM_DEBUG(BestVPlan.dump());
75167530

75177531
// Perform the actual loop transformation.
7518-
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan);
7532+
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7533+
OrigLoop);
75197534

75207535
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
75217536
// before making any changes to the CFG.
@@ -7577,12 +7592,15 @@ LoopVectorizationPlanner::executePlan(
75777592

75787593
// 2.5 Collect reduction resume values.
75797594
DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7580-
auto *ExitVPBB =
7581-
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7582-
for (VPRecipeBase &R : *ExitVPBB) {
7583-
createAndCollectMergePhiForReduction(
7584-
dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7585-
State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7595+
VPBasicBlock *ExitVPBB = nullptr;
7596+
if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
7597+
ExitVPBB = cast<VPBasicBlock>(
7598+
BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7599+
for (VPRecipeBase &R : *ExitVPBB) {
7600+
createAndCollectMergePhiForReduction(
7601+
dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop,
7602+
State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
7603+
}
75867604
}
75877605

75887606
// 2.6. Maintain Loop Hints
@@ -7608,6 +7626,7 @@ LoopVectorizationPlanner::executePlan(
76087626
LoopVectorizeHints Hints(L, true, *ORE);
76097627
Hints.setAlreadyVectorized();
76107628
}
7629+
76117630
TargetTransformInfo::UnrollingPreferences UP;
76127631
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
76137632
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7620,15 +7639,17 @@ LoopVectorizationPlanner::executePlan(
76207639
ILV.printDebugTracesAtEnd();
76217640

76227641
// 4. Adjust branch weight of the branch in the middle block.
7623-
auto *MiddleTerm =
7624-
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7625-
if (MiddleTerm->isConditional() &&
7626-
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7627-
// Assume that `Count % VectorTripCount` is equally distributed.
7628-
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7629-
assert(TripCount > 0 && "trip count should not be zero");
7630-
const uint32_t Weights[] = {1, TripCount - 1};
7631-
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7642+
if (ExitVPBB) {
7643+
auto *MiddleTerm =
7644+
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7645+
if (MiddleTerm->isConditional() &&
7646+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7647+
// Assume that `Count % VectorTripCount` is equally distributed.
7648+
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7649+
assert(TripCount > 0 && "trip count should not be zero");
7650+
const uint32_t Weights[] = {1, TripCount - 1};
7651+
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7652+
}
76327653
}
76337654

76347655
return {State.ExpandedSCEVs, ReductionResumeValues};
@@ -8013,7 +8034,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
80138034
// If source is an exiting block, we know the exit edge is dynamically dead
80148035
// in the vector loop, and thus we don't need to restrict the mask. Avoid
80158036
// adding uses of an otherwise potentially dead instruction.
8016-
if (OrigLoop->isLoopExiting(Src))
8037+
if (!Legal->canVectorizeMultiCond() && OrigLoop->isLoopExiting(Src))
80178038
return EdgeMaskCache[Edge] = SrcMask;
80188039

80198040
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8630,6 +8651,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
86308651
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
86318652
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
86328653
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8654+
if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
8655+
return {};
86338656
auto *MiddleVPBB =
86348657
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
86358658
// No edge from the middle block to the unique exit block has been inserted
@@ -8717,6 +8740,8 @@ static void addLiveOutsForFirstOrderRecurrences(
87178740
// TODO: Should be replaced by
87188741
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
87198742
// scalar region is modeled as well.
8743+
if (!VectorRegion->getSingleSuccessor())
8744+
return;
87208745
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
87218746
VPBasicBlock *ScalarPHVPBB = nullptr;
87228747
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -8991,6 +9016,67 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89919016
"VPBasicBlock");
89929017
RecipeBuilder.fixHeaderPhis();
89939018

9019+
SmallVector<BasicBlock *> Exiting;
9020+
OrigLoop->getExitingBlocks(Exiting);
9021+
9022+
if (Legal->canVectorizeMultiCond()) {
9023+
auto *LatchVPBB =
9024+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getExiting());
9025+
VPBuilder::InsertPointGuard Guard(Builder);
9026+
Builder.setInsertPoint(LatchVPBB->getTerminator());
9027+
auto *MiddleVPBB =
9028+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
9029+
9030+
VPValue *EarlyExitTaken = nullptr;
9031+
SmallVector<VPValue *> ExitTaken;
9032+
SmallVector<PHINode *> ExitPhis;
9033+
SmallVector<Value *> ExitValues;
9034+
BasicBlock *ExitBlock;
9035+
for (BasicBlock *E : Exiting) {
9036+
if (E == OrigLoop->getLoopLatch()) {
9037+
BasicBlock *TrueSucc =
9038+
cast<BranchInst>(E->getTerminator())->getSuccessor(0);
9039+
BasicBlock *FalseSucc =
9040+
cast<BranchInst>(E->getTerminator())->getSuccessor(1);
9041+
auto EB = !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
9042+
9043+
auto *VPExitBlock = new VPIRBasicBlock(EB);
9044+
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
9045+
VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
9046+
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
9047+
continue;
9048+
}
9049+
BasicBlock *TrueSucc =
9050+
cast<BranchInst>(E->getTerminator())->getSuccessor(0);
9051+
BasicBlock *FalseSucc =
9052+
cast<BranchInst>(E->getTerminator())->getSuccessor(1);
9053+
VPValue *M = RecipeBuilder.getBlockInMask(
9054+
OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
9055+
9056+
auto *N = Builder.createNot(M);
9057+
auto *EC = Builder.createNaryOp(VPInstruction::AnyOf, {N});
9058+
ExitTaken.push_back(EC);
9059+
if (EarlyExitTaken)
9060+
EarlyExitTaken = Builder.createOr(EarlyExitTaken, EC);
9061+
else
9062+
EarlyExitTaken = EC;
9063+
ExitBlock = !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
9064+
}
9065+
9066+
auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
9067+
auto *IsLatchExiting = Builder.createICmp(
9068+
CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
9069+
Builder.createNaryOp(VPInstruction::BranchMultipleConds,
9070+
{EarlyExitTaken, IsLatchExiting});
9071+
Term->eraseFromParent();
9072+
9073+
auto *EA = new VPIRBasicBlock(ExitBlock);
9074+
VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
9075+
VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
9076+
VPBlockUtils::connectBlocks(LoopRegion, EA);
9077+
VPBlockUtils::connectBlocks(LoopRegion, MiddleVPBB);
9078+
}
9079+
89949080
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
89959081
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
89969082
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
@@ -9062,6 +9148,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90629148
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
90639149
WithoutRuntimeCheck);
90649150
}
9151+
90659152
return Plan;
90669153
}
90679154

@@ -9286,6 +9373,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
92869373
}
92879374
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
92889375
Builder.setInsertPoint(&*LatchVPBB->begin());
9376+
if (!VectorLoopRegion->getSingleSuccessor())
9377+
return;
92899378
VPBasicBlock *MiddleVPBB =
92909379
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
92919380
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();

0 commit comments

Comments
 (0)