-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[VPlan] Dispatch to multiple exit blocks via middle blocks. #112138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 27 commits
245b56a
47258de
9265fb1
3831acb
e64888a
64db0ee
3259e66
0f8aedf
9212f96
5cb0851
e849195
7b98d34
c53eca6
43a8ef7
e26af8e
06c3d39
552bd91
2042a43
00dea4a
7b8866d
4d5608f
b9ee739
43d5590
cba7dce
95f4276
c3d3b39
a875249
65d0288
8d04383
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
digraph VPlan { | ||
graph [labelloc=t, fontsize=30; label=""] | ||
node [shape=rect, fontname=Courier, fontsize=30] | ||
edge [fontname=Courier, fontsize=30] | ||
compound=true | ||
N1 [label = | ||
"vector.ph" | ||
] | ||
N1 -> N2 [ label="" lhead=cluster_N3] | ||
subgraph cluster_N3 { | ||
fontname=Courier | ||
label="\<x1\> vector loop" | ||
N2 [label = | ||
"vector.body" | ||
] | ||
} | ||
N2 -> N4 [ label="" ltail=cluster_N3] | ||
N4 [label = | ||
"middle.split" | ||
] | ||
N4 -> N5 [ label=""] | ||
N4 -> N6 [ label=""] | ||
N5 [label = | ||
"early.exit" | ||
] | ||
N6 [label = | ||
"middle.block" | ||
] | ||
N6 -> N9 [ label=""] | ||
N6 -> N7 [ label=""] | ||
N7 [label = | ||
"scalar.ph" | ||
] | ||
N7 -> N8 [ label=""] | ||
N8 [label = | ||
"loop.header" | ||
] | ||
N9 [label = | ||
"latch.exit" | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent( | |
cl::Hidden, | ||
cl::desc("Try wider VFs if they enable the use of vector variants")); | ||
|
||
static cl::opt<bool> EnableEarlyExitVectorization( | ||
"enable-early-exit-vectorization", cl::init(false), cl::Hidden, | ||
cl::desc( | ||
"Enable vectorization of early exit loops with uncountable exits.")); | ||
|
||
// Likelihood of bypassing the vectorized loop because assumptions about SCEV | ||
// variables not overflowing do not hold. See `emitSCEVChecks`. | ||
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; | ||
|
@@ -1382,9 +1387,10 @@ class LoopVectorizationCostModel { | |
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); | ||
return false; | ||
} | ||
// If we might exit from anywhere but the latch, must run the exiting | ||
// iteration in scalar form. | ||
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { | ||
// If we might exit from anywhere but the latch and early exit vectorization | ||
// is disabled, we must run the exiting iteration in scalar form. | ||
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && | ||
!(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { | ||
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " | ||
"from latch block\n"); | ||
return true; | ||
|
@@ -3656,10 +3662,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { | |
|
||
// Start with the conditional branches exiting the loop. If the branch | ||
// condition is an instruction contained in the loop that is only used by the | ||
// branch, it is uniform. | ||
// branch, it is uniform. Note conditions from uncountable early exits are not | ||
// uniform. | ||
SmallVector<BasicBlock *> Exiting; | ||
TheLoop->getExitingBlocks(Exiting); | ||
for (BasicBlock *E : Exiting) { | ||
if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) | ||
continue; | ||
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); | ||
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) | ||
AddToWorklistIfAllowed(Cmp); | ||
|
@@ -7850,16 +7859,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( | |
|
||
ILV.printDebugTracesAtEnd(); | ||
|
||
// 4. Adjust branch weight of the branch in the middle block. | ||
auto *MiddleTerm = | ||
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); | ||
if (MiddleTerm->isConditional() && | ||
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { | ||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); | ||
// 4. Adjust branch weight of the branch in the middle block if it exists. | ||
if (ExitVPBB) { | ||
auto *MiddleTerm = | ||
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); | ||
if (MiddleTerm->isConditional() && | ||
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { | ||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); | ||
} | ||
} | ||
|
||
return State.ExpandedSCEVs; | ||
|
@@ -8239,8 +8250,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { | |
|
||
// If source is an exiting block, we know the exit edge is dynamically dead | ||
// in the vector loop, and thus we don't need to restrict the mask. Avoid | ||
// adding uses of an otherwise potentially dead instruction. | ||
if (OrigLoop->isLoopExiting(Src)) | ||
// adding uses of an otherwise potentially dead instruction unless we are | ||
// vectorizing a loop with uncountable exits. In that case, we always | ||
// materialize the mask. | ||
if (OrigLoop->isLoopExiting(Src) && | ||
Src != Legal->getUncountableEarlyExitingBlock()) | ||
return EdgeMaskCache[Edge] = SrcMask; | ||
|
||
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); | ||
|
@@ -8931,50 +8945,58 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { | |
static SetVector<VPIRInstruction *> collectUsersInExitBlocks( | ||
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, | ||
const MapVector<PHINode *, InductionDescriptor> &Inductions) { | ||
auto *MiddleVPBB = Plan.getMiddleBlock(); | ||
SetVector<VPIRInstruction *> ExitUsersToFix; | ||
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { | ||
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock(); | ||
BasicBlock *ExitingBB = find_singleton<BasicBlock>( | ||
to_vector(predecessors(ExitBB)), | ||
[OrigLoop](BasicBlock *Pred, bool AllowRepeats) { | ||
return OrigLoop->contains(Pred) ? Pred : nullptr; | ||
}); | ||
for (VPRecipeBase &R : *ExitVPBB) { | ||
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); | ||
if (!ExitIRI) | ||
continue; | ||
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); | ||
if (!ExitPhi) | ||
break; | ||
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); | ||
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); | ||
// Exit values for inductions are computed and updated outside of VPlan | ||
// and independent of induction recipes. | ||
// TODO: Compute induction exit values in VPlan. | ||
if ((isa<VPWidenIntOrFpInductionRecipe>(V) && | ||
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || | ||
isa<VPWidenPointerInductionRecipe>(V) || | ||
(isa<Instruction>(IncomingValue) && | ||
OrigLoop->contains(cast<Instruction>(IncomingValue)) && | ||
any_of(IncomingValue->users(), [&Inductions](User *U) { | ||
auto *P = dyn_cast<PHINode>(U); | ||
return P && Inductions.contains(P); | ||
}))) | ||
continue; | ||
ExitUsersToFix.insert(ExitIRI); | ||
ExitIRI->addOperand(V); | ||
for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { | ||
BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); | ||
if (PredVPBB != MiddleVPBB) { | ||
SmallVector<BasicBlock *> ExitingBlocks; | ||
OrigLoop->getExitingBlocks(ExitingBlocks); | ||
assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); | ||
ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1] | ||
: ExitingBlocks[0]; | ||
} | ||
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); | ||
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); | ||
// Exit values for inductions are computed and updated outside of VPlan | ||
// and independent of induction recipes. | ||
// TODO: Compute induction exit values in VPlan. | ||
if ((isa<VPWidenIntOrFpInductionRecipe>(V) && | ||
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || | ||
isa<VPWidenPointerInductionRecipe>(V) || | ||
(isa<Instruction>(IncomingValue) && | ||
OrigLoop->contains(cast<Instruction>(IncomingValue)) && | ||
any_of(IncomingValue->users(), [&Inductions](User *U) { | ||
auto *P = dyn_cast<PHINode>(U); | ||
return P && Inductions.contains(P); | ||
}))) { | ||
if (ExitVPBB->getSinglePredecessor() == MiddleVPBB) | ||
continue; | ||
} | ||
ExitUsersToFix.insert(ExitIRI); | ||
ExitIRI->addOperand(V); | ||
} | ||
} | ||
} | ||
return ExitUsersToFix; | ||
} | ||
|
||
// Add exit values to \p Plan. Extracts are added for each entry in \p | ||
// ExitUsersToFix if needed and their operands are updated. | ||
static void | ||
// ExitUsersToFix if needed and their operands are updated. Returns true if all | ||
// exit users can be handled, otherwise return false. | ||
static bool | ||
addUsersInExitBlocks(VPlan &Plan, | ||
const SetVector<VPIRInstruction *> &ExitUsersToFix) { | ||
if (ExitUsersToFix.empty()) | ||
return; | ||
return true; | ||
|
||
auto *MiddleVPBB = Plan.getMiddleBlock(); | ||
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); | ||
|
@@ -8988,14 +9010,18 @@ addUsersInExitBlocks(VPlan &Plan, | |
if (V->isLiveIn()) | ||
continue; | ||
|
||
assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && | ||
"Exit value not handled yet for this edge."); | ||
// Currently only live-ins can be used by exit values from blocks not | ||
// exiting via the vector latch through to the middle block. | ||
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) | ||
return false; | ||
|
||
LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); | ||
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, | ||
{V, Plan.getOrAddLiveIn(ConstantInt::get( | ||
IntegerType::get(Ctx, 32), 1))}); | ||
ExitIRI->setOperand(0, Ext); | ||
} | ||
return true; | ||
} | ||
|
||
/// Handle users in the exit block for first order reductions in the original | ||
|
@@ -9268,11 +9294,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { | |
"VPBasicBlock"); | ||
RecipeBuilder.fixHeaderPhis(); | ||
|
||
if (auto *UncountableExitingBlock = | ||
Legal->getUncountableEarlyExitingBlock()) { | ||
VPlanTransforms::handleUncountableEarlyExit( | ||
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); | ||
} | ||
addScalarResumePhis(RecipeBuilder, *Plan); | ||
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks( | ||
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); | ||
Comment on lines
9301
to
9302
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would require |
||
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); | ||
addUsersInExitBlocks(*Plan, ExitUsersToFix); | ||
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { | ||
reportVectorizationFailure( | ||
"Some exit values in loop with uncountable exit not supported yet", | ||
"Some exit values in loop with uncountable exit not supported yet", | ||
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); | ||
return nullptr; | ||
} | ||
|
||
// --------------------------------------------------------------------------- | ||
// Transform initial VPlan: Apply previously taken decisions, in order, to | ||
// bring the VPlan to its final state. | ||
|
@@ -10138,12 +10176,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
return false; | ||
} | ||
|
||
if (LVL.hasUncountableEarlyExit()) { | ||
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { | ||
reportVectorizationFailure("Auto-vectorization of loops with uncountable " | ||
"early exit is not yet supported", | ||
"early exit is not enabled", | ||
"Auto-vectorization of loops with uncountable " | ||
"early exit is not yet supported", | ||
"UncountableEarlyExitLoopsUnsupported", ORE, L); | ||
"early exit is not enabled", | ||
"UncountableEarlyExitLoopsDisabled", ORE, L); | ||
return false; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -861,14 +861,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, | |
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader); | ||
|
||
// Create SCEV and VPValue for the trip count. | ||
|
||
// Currently only loops with countable exits are vectorized, but calling | ||
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with | ||
// uncountable exits whilst also ensuring the symbolic maximum and known | ||
// back-edge taken count remain identical for loops with countable exits. | ||
// We use the symbolic max backedge-taken-count, which works also when | ||
// vectorizing loops with uncountable early exits. | ||
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount(); | ||
assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it still worth at least having:
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, added back, thanks |
||
BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) && | ||
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) && | ||
"Invalid loop count"); | ||
ScalarEvolution &SE = *PSE.getSE(); | ||
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV, | ||
|
@@ -903,7 +899,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, | |
// 2) If we require a scalar epilogue, there is no conditional branch as | ||
// we unconditionally branch to the scalar preheader. Do nothing. | ||
// 3) Otherwise, construct a runtime check. | ||
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); | ||
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock(); | ||
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); | ||
// The connection order corresponds to the operands of the conditional branch. | ||
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1232,6 +1232,9 @@ class VPInstruction : public VPRecipeWithIRFlags, | |
// operand). Only generates scalar values (either for the first lane only or | ||
// for all lanes, depending on its uses). | ||
PtrAdd, | ||
// Returns a scalar boolean value, which is true if any lane of its single | ||
// operand is true. | ||
AnyOf, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps worth adding a simple comment here? Something along the lines of:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added, thanks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some explanation how AnyOf relates (or should relate) to ComputeReductionResult? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think AnyOf also needs adding to the switch statement in VPRecipeBase::mayWriteToMemory and return false? |
||
}; | ||
|
||
private: | ||
|
@@ -3884,10 +3887,10 @@ class VPlan { | |
/// whether to execute the scalar tail loop or the exit block from the loop | ||
/// latch. | ||
const VPBasicBlock *getMiddleBlock() const { | ||
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor()); | ||
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This restricts the use of getMiddleBlock() to before bypassing guards are introduced? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it doesn't really restrict the use of getMiddleBlock, it just updates the anchor point we use to identify it; the scalar preheader (and its single predecessor) can be more easily identified and works automatically with the changes to the skeleton. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Works as long as the scalar preheader has a single predecessor, i.e., until runtime guards are introduced as additional predecessors. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, this will need some extra work for #114292, which I plan to land after this PR. |
||
} | ||
VPBasicBlock *getMiddleBlock() { | ||
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor()); | ||
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor()); | ||
} | ||
|
||
/// Return the VPBasicBlock for the preheader of the scalar loop. | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding "if it exists". The comment was also meant to ask when the middle block would not exist. Should this refer to cases where the middle block (which conceptually is always there, after the vector loop and before whatever comes next (exit(s) and/or scalar loop)), rather than being a single block (that ends with a conditional or unconditional branch), is split into multiple blocks because it has more than two successors (rather than targeting them all with a switch)?
A null ExitVPBB, OTOH, corresponds to a scalar preheader having more than one predecessor, i.e., it also has runtime guards as predecessors.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, restored the original code, always setting the branch weights for the middle block which contains the branch on the trip count