Skip to content

Commit b841e2e

Browse files
committed
Recommit "[VPlan] First step towards VPlan cost modeling. (#92555)"
This reverts commit 6f538f6. A number of crashes have been fixed by separate fixes, including https://github.com/llvm/llvm-project/pull/96622. This version of the PR also pre-computes the costs for branches (except the latch) instead of computing their costs as part of costing of replicate regions, as there may not be a direct correspondence between original branches and number of replicate regions. Original message: This adds a new interface to compute the cost of recipes, VPBasicBlocks, VPRegionBlocks and VPlan, initially falling back to the legacy cost model for all recipes. Follow-up patches will gradually migrate recipes to compute their own costs step-by-step. It also adds a getBestPlan function to LVP which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. The VPlan selected by the VPlan cost model is executed and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree. Even though I checked a number of different build configurations on AArch64 and X86, there may be some differences that have been missed. Additional discussions and context can be found in @arcbbb's #67647 and #67934, which is an earlier version of the current PR. PR: #92555
1 parent 62b3e68 commit b841e2e

File tree

9 files changed

+403
-27
lines changed

9 files changed

+403
-27
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,16 @@ class LoopVectorizationPlanner {
334334
/// A builder used to construct the current plan.
335335
VPBuilder Builder;
336336

337+
/// Computes the cost of \p Plan for vectorization factor \p VF.
338+
///
339+
/// The current implementation requires access to the
340+
/// LoopVectorizationLegality to handle inductions and reductions, which is
341+
/// why it is kept separate from the VPlan-only cost infrastructure.
342+
///
343+
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
344+
/// been retired.
345+
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
346+
337347
public:
338348
LoopVectorizationPlanner(
339349
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -355,6 +365,9 @@ class LoopVectorizationPlanner {
355365
/// Return the best VPlan for \p VF.
356366
VPlan &getBestPlanFor(ElementCount VF) const;
357367

368+
/// Return the most profitable plan and fix its VF to the most profitable one.
369+
VPlan &getBestPlan() const;
370+
358371
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
359372
/// according to the best selected \p VF and \p UF.
360373
///
@@ -434,6 +447,9 @@ class LoopVectorizationPlanner {
434447

435448
/// \return The most profitable vectorization factor for the available VPlans
436449
/// and the cost of that VF.
450+
/// This is now only used to verify the decisions by the new VPlan-based
451+
/// cost-model and will be retired once the VPlan-based cost-model is
452+
/// stabilized.
437453
VectorizationFactor selectVectorizationFactor();
438454

439455
/// Returns true if the per-lane cost of VectorizationFactor A is lower than

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 229 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290290
cl::desc("A flag that overrides the target's max interleave factor for "
291291
"vectorized loops."));
292292

293-
static cl::opt<unsigned> ForceTargetInstructionCost(
293+
cl::opt<unsigned> ForceTargetInstructionCost(
294294
"force-target-instruction-cost", cl::init(0), cl::Hidden,
295295
cl::desc("A flag that overrides the target's expected cost for "
296296
"an instruction to a single constant value. Mostly "
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412412
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
413413
}
414414

415-
/// A helper function that returns the reciprocal of the block probability of
416-
/// predicated blocks. If we return X, we are assuming the predicated block
417-
/// will execute once for every X iterations of the loop header.
418-
///
419-
/// TODO: We should use actual block probability here, if available. Currently,
420-
/// we always assume predicated blocks have a 50% chance of executing.
421-
static unsigned getReciprocalPredBlockProb() { return 2; }
422-
423415
/// Returns "best known" trip count for the specified loop \p L as defined by
424416
/// the following procedure:
425417
/// 1) Returns exact trip count if it is known.
@@ -1608,6 +1600,16 @@ class LoopVectorizationCostModel {
16081600
/// \p VF is the vectorization factor chosen for the original loop.
16091601
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
16101602

1603+
/// Returns the execution time cost of an instruction for a given vector
1604+
/// width. Vector width of one means scalar.
1605+
InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1606+
1607+
/// Return the cost of instructions in an inloop reduction pattern, if I is
1608+
/// part of that pattern.
1609+
std::optional<InstructionCost>
1610+
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1611+
TTI::TargetCostKind CostKind) const;
1612+
16111613
private:
16121614
unsigned NumPredStores = 0;
16131615

@@ -1633,16 +1635,6 @@ class LoopVectorizationCostModel {
16331635
/// of elements.
16341636
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
16351637

1636-
/// Returns the execution time cost of an instruction for a given vector
1637-
/// width. Vector width of one means scalar.
1638-
InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1639-
1640-
/// Return the cost of instructions in an inloop reduction pattern, if I is
1641-
/// part of that pattern.
1642-
std::optional<InstructionCost>
1643-
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1644-
TTI::TargetCostKind CostKind) const;
1645-
16461638
/// Calculate vectorization cost of memory instruction \p I.
16471639
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
16481640

@@ -7288,7 +7280,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72887280
[](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
72897281
return VectorizationFactor::Disabled();
72907282

7291-
// Select the optimal vectorization factor.
7283+
// Select the optimal vectorization factor according to the legacy cost-model.
7284+
// This is now only used to verify the decisions by the new VPlan-based
7285+
// cost-model and will be retired once the VPlan-based cost-model is
7286+
// stabilized.
72927287
VectorizationFactor VF = selectVectorizationFactor();
72937288
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
72947289
if (!hasPlanWithVF(VF.Width)) {
@@ -7299,6 +7294,211 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72997294
return VF;
73007295
}
73017296

7297+
/// Returns the cost of the underlying instruction \p UI for vectorization
/// factor \p VF by forwarding to the legacy cost model's getInstructionCost().
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7298+
ElementCount VF) const {
7299+
return CM.getInstructionCost(UI, VF);
7300+
}
7301+
7302+
/// Returns true if no cost should be computed for \p UI, i.e. if \p UI is in
/// the legacy cost model's ValuesToIgnore set, in its VecValuesToIgnore set
/// (only consulted when \p IsVector is true), or in this context's
/// SkipCostComputation set.
bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7303+
return CM.ValuesToIgnore.contains(UI) ||
7304+
(IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7305+
SkipCostComputation.contains(UI);
7306+
}
7307+
7308+
/// Computes the cost of \p Plan for vectorization factor \p VF.
///
/// The result is the sum of (1) costs pre-computed here with the legacy cost
/// model for induction instructions, exit conditions, AnyOf/in-loop reduction
/// instructions and all branches except the latch terminator, and (2) the
/// VPlan-based cost returned by Plan.cost(). Every instruction costed in (1)
/// is recorded in CostCtx.SkipCostComputation, which
/// VPCostContext::skipCostComputation() consults.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7309+
ElementCount VF) const {
7310+
InstructionCost Cost = 0;
7311+
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7312+
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7313+
7314+
// Cost modeling for inductions is inaccurate in the legacy cost model
7315+
// compared to the recipes that are generated. To match here initially during
7316+
// VPlan cost model bring up directly use the induction costs from the legacy
7317+
// cost model. Note that we do this as pre-processing; the VPlan may not have
7318+
// any recipes associated with the original induction increment instruction
7319+
// and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7320+
// the cost of induction phis and increments (both that are represented by
7321+
// recipes and those that are not), to avoid distinguishing between them here,
7322+
// and skip all recipes that represent induction phis and increments (the
7323+
// former case) later on, if they exist, to avoid counting them twice.
7324+
// Similarly we pre-compute the cost of any optimized truncates.
7325+
// TODO: Switch to more accurate costing based on VPlan.
7326+
for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7327+
Instruction *IVInc = cast<Instruction>(
7328+
IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7329+
SmallVector<Instruction *> IVInsts = {IV, IVInc};
// Add every user of the induction phi that the legacy cost model reports as
// an optimizable IV truncate for this VF.
7330+
for (User *U : IV->users()) {
7331+
auto *CI = cast<Instruction>(U);
7332+
if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7333+
continue;
7334+
IVInsts.push_back(CI);
7335+
}
7336+
for (Instruction *IVInst : IVInsts) {
// A failed insert means the instruction was already recorded (and costed)
// earlier; skip it so its cost is not added a second time.
7337+
if (!CostCtx.SkipCostComputation.insert(IVInst).second)
7338+
continue;
7339+
InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7340+
LLVM_DEBUG({
7341+
dbgs() << "Cost of " << InductionCost << " for VF " << VF
7342+
<< ": induction instruction " << *IVInst << "\n";
7343+
});
7344+
Cost += InductionCost;
7345+
}
7346+
}
7347+
7348+
// Compute the cost of all exiting conditions of the loop using the legacy
7349+
// cost model. This is to match the legacy behavior, which adds the cost of
7350+
// all exit conditions. Note that this over-estimates the cost, as there will
7351+
// be a single condition to control the vector loop.
7352+
SmallVector<BasicBlock *> Exiting;
7353+
CM.TheLoop->getExitingBlocks(Exiting);
7354+
SetVector<Instruction *> ExitInstrs;
7355+
// Collect all exit conditions.
7356+
for (BasicBlock *EB : Exiting) {
7357+
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7358+
if (!Term)
7359+
continue;
7360+
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7361+
ExitInstrs.insert(CondI);
7362+
}
7363+
}
7364+
// Compute the cost of all instructions only feeding the exit conditions.
// ExitInstrs acts as a worklist: it grows inside the loop body, which is why
// it is walked by index and its size is re-read on every iteration.
7365+
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7366+
Instruction *CondI = ExitInstrs[I];
7367+
if (!OrigLoop->contains(CondI) ||
7368+
!CostCtx.SkipCostComputation.insert(CondI).second)
7369+
continue;
7370+
Cost += CostCtx.getLegacyCost(CondI, VF);
// Queue an operand only if it is an instruction whose users are all already
// in ExitInstrs, i.e. it feeds nothing but the exit conditions.
7371+
for (Value *Op : CondI->operands()) {
7372+
auto *OpI = dyn_cast<Instruction>(Op);
7373+
if (!OpI || any_of(OpI->users(), [&ExitInstrs](User *U) {
7374+
return !ExitInstrs.contains(cast<Instruction>(U));
7375+
}))
7376+
continue;
7377+
ExitInstrs.insert(OpI);
7378+
}
7379+
}
7380+
7381+
// The legacy cost model has special logic to compute the cost of in-loop
7382+
// reductions, which may be smaller than the sum of all instructions involved
7383+
// in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7384+
// which the legacy cost model uses to assign cost. Pre-compute their costs
7385+
// for now.
7386+
// TODO: Switch to costing based on VPlan once the logic has been ported.
7387+
for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
// Only in-loop reductions and AnyOf reductions are pre-computed here.
7388+
if (!CM.isInLoopReduction(RedPhi) &&
7389+
!RecurrenceDescriptor::isAnyOfRecurrenceKind(
7390+
RdxDesc.getRecurrenceKind()))
7391+
continue;
7392+
7393+
// AnyOf reduction codegen may remove the select. To match the legacy cost
7394+
// model, pre-compute the cost for AnyOf reductions here.
7395+
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7396+
RdxDesc.getRecurrenceKind())) {
// NOTE(review): the find_if result is dereferenced without comparing it to
// end(); this assumes an AnyOf reduction phi always has a SelectInst user.
7397+
auto *Select = cast<SelectInst>(*find_if(
7398+
RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7399+
assert(!CostCtx.SkipCostComputation.contains(Select) &&
7400+
"reduction op visited multiple times");
7401+
CostCtx.SkipCostComputation.insert(Select);
7402+
auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7403+
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7404+
<< ":\n any-of reduction " << *Select << "\n");
7405+
Cost += ReductionCost;
7406+
continue;
7407+
}
7408+
7409+
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7410+
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7411+
ChainOps.end());
7412+
// Also include the operands of instructions in the chain, as the cost-model
7413+
// may mark extends as free.
7414+
for (auto *ChainOp : ChainOps) {
7415+
for (Value *Op : ChainOp->operands()) {
7416+
if (auto *I = dyn_cast<Instruction>(Op))
7417+
ChainOpsAndOperands.insert(I);
7418+
}
7419+
}
7420+
7421+
// Pre-compute the cost for I, if it has a reduction pattern cost.
7422+
for (Instruction *I : ChainOpsAndOperands) {
7423+
auto ReductionCost = CM.getReductionPatternCost(
7424+
I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
// An empty optional means I has no reduction pattern cost; leave it to be
// costed elsewhere.
7425+
if (!ReductionCost)
7426+
continue;
7427+
7428+
assert(!CostCtx.SkipCostComputation.contains(I) &&
7429+
"reduction op visited multiple times");
7430+
CostCtx.SkipCostComputation.insert(I);
7431+
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7432+
<< ":\n in-loop reduction " << *I << "\n");
7433+
Cost += *ReductionCost;
7434+
}
7435+
}
7436+
7437+
// Pre-compute the costs for branches except for the backedge, as the number
7438+
// of replicate regions in a VPlan may not directly match the number of
7439+
// branches, which would lead to different decisions.
7440+
// TODO: Compute cost of branches for each replicate region in the VPlan,
7441+
// which is more accurate than the legacy cost model.
7442+
for (BasicBlock *BB : OrigLoop->blocks()) {
// The latch terminator is the backedge branch; it is not costed here.
7443+
if (BB == OrigLoop->getLoopLatch())
7444+
continue;
7445+
CostCtx.SkipCostComputation.insert(BB->getTerminator());
7446+
auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7447+
Cost += BranchCost;
7448+
}
7449+
// Now compute and add the VPlan-based cost.
7450+
Cost += Plan.cost(VF, CostCtx);
7451+
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7452+
return Cost;
7453+
}
7454+
7455+
/// Returns the most profitable VPlan and sets its VF to the most profitable
/// one.
///
/// If there is exactly one plan with exactly one VF, it is returned unchanged.
/// Otherwise the cost of every non-scalar VF of every plan is computed with
/// cost() and compared against the current best via isMoreProfitable(); the
/// baseline is the scalar VF with the legacy cost model's expected cost.
VPlan &LoopVectorizationPlanner::getBestPlan() const {
7456+
// If there is a single VPlan with a single VF, return it directly.
// NOTE(review): VPlans[0] is accessed unconditionally, so this assumes VPlans
// is non-empty when getBestPlan() is called -- confirm against callers.
7457+
VPlan &FirstPlan = *VPlans[0];
7458+
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7459+
return FirstPlan;
7460+
7461+
VPlan *BestPlan = &FirstPlan;
7462+
ElementCount ScalarVF = ElementCount::getFixed(1);
7463+
assert(hasPlanWithVF(ScalarVF) &&
7464+
"More than a single plan/VF w/o any plan having scalar VF");
7465+
7466+
// TODO: Compute scalar cost using VPlan-based cost model.
7467+
InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7468+
VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7469+
7470+
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7471+
if (ForceVectorization) {
7472+
// Ignore scalar width, because the user explicitly wants vectorization.
7473+
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
7474+
// evaluation.
7475+
BestFactor.Cost = InstructionCost::getMax();
7476+
}
7477+
7478+
for (auto &P : VPlans) {
7479+
for (ElementCount VF : P->vectorFactors()) {
// The scalar VF is the baseline already held in BestFactor.
7480+
if (VF.isScalar())
7481+
continue;
// Unless vectorization is forced, skip VFs for which willGenerateVectors()
// reports that no vector instructions would be produced.
7482+
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7483+
LLVM_DEBUG(
7484+
dbgs()
7485+
<< "LV: Not considering vector loop of width " << VF
7486+
<< " because it will not generate any vector instructions.\n");
7487+
continue;
7488+
}
7489+
7490+
InstructionCost Cost = cost(*P, VF);
7491+
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7492+
if (isMoreProfitable(CurrentFactor, BestFactor)) {
7493+
BestFactor = CurrentFactor;
7494+
BestPlan = &*P;
7495+
}
7496+
}
7497+
}
// NOTE(review): if no vector VF beats the scalar baseline, BestPlan is still
// FirstPlan and BestFactor.Width is the scalar VF; this presumably relies on
// FirstPlan containing the scalar VF -- the assert above only checks that
// some plan does.
7498+
BestPlan->setVF(BestFactor.Width);
7499+
return *BestPlan;
7500+
}
7501+
73027502
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
73037503
assert(count_if(VPlans,
73047504
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10169,8 +10369,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016910369
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
1017010370
PSI, Checks);
1017110371

10172-
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10173-
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10372+
VPlan &BestPlan = LVP.getBestPlan();
10373+
assert(size(BestPlan.vectorFactors()) == 1 &&
10374+
"Plan should have a single VF");
10375+
ElementCount Width = *BestPlan.vectorFactors().begin();
10376+
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10377+
<< "\n");
10378+
assert(VF.Width == Width &&
10379+
"VPlan cost model and legacy cost model disagreed");
10380+
LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
1017410381
++LoopsVectorized;
1017510382

1017610383
// Add metadata to disable runtime unrolling a scalar loop when there

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,48 @@ void VPRegionBlock::execute(VPTransformState *State) {
776776
State->Instance.reset();
777777
}
778778

779+
/// Returns the cost of this block for vectorization factor \p VF, computed as
/// the sum of the costs of all recipes it contains. \p Ctx is passed through
/// to each recipe's cost().
InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
780+
InstructionCost Cost = 0;
781+
for (VPRecipeBase &R : Recipes)
782+
Cost += R.cost(VF, Ctx);
783+
return Cost;
784+
}
785+
786+
/// Returns the cost of this region for vectorization factor \p VF.
///
/// For a non-replicate region this is the sum of the costs of the blocks
/// visited by a shallow depth-first traversal from the entry, plus the cost of
/// one branch for the vector loop backedge. For a replicate region it is the
/// cost of the conditionally executed block, divided by
/// getReciprocalPredBlockProb() when \p VF is scalar; an invalid cost is
/// returned for scalable \p VF.
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
787+
if (!isReplicator()) {
788+
InstructionCost Cost = 0;
789+
for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
790+
Cost += Block->cost(VF, Ctx);
// Add the cost of a single branch instruction for the backedge, as reported
// by TTI.
791+
InstructionCost BackedgeCost =
792+
Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
793+
LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
794+
<< ": vector loop backedge\n");
795+
Cost += BackedgeCost;
796+
return Cost;
797+
}
798+
799+
// Compute the cost of a replicate region. Replicating isn't supported for
800+
// scalable vectors, return an invalid cost for them.
801+
// TODO: Discard scalable VPlans with replicate recipes earlier after
802+
// construction.
803+
if (VF.isScalable())
804+
return InstructionCost::getInvalid();
805+
806+
// Compute the cost of the conditionally executed recipes. No branching cost
807+
// is added here: the costs of branches other than the latch are pre-computed
808+
// in LoopVectorizationPlanner::cost.
// NOTE(review): nothing in this function visibly uses VPlanPatternMatch; the
// using-directive below looks like a leftover -- confirm before removing.
809+
using namespace llvm::VPlanPatternMatch;
// The first successor of the region's entry is taken to be the block holding
// the conditionally executed recipes.
810+
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
811+
InstructionCost ThenCost = Then->cost(VF, Ctx);
812+
813+
// For the scalar case, we may not always execute the original predicated
814+
// block. Thus, scale the block's cost by the probability of executing it.
815+
if (VF.isScalar())
816+
return ThenCost / getReciprocalPredBlockProb();
817+
818+
return ThenCost;
819+
}
820+
779821
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
780822
void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
781823
VPSlotTracker &SlotTracker) const {
@@ -1040,6 +1082,12 @@ void VPlan::execute(VPTransformState *State) {
10401082
"DT not preserved correctly");
10411083
}
10421084

1085+
/// Returns the cost of this plan for vectorization factor \p VF, which is
/// currently just the cost of the vector loop region.
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
1086+
// For now only return the cost of the vector loop region, ignoring any other
1087+
// blocks, like the preheader or middle blocks.
1088+
return getVectorLoopRegion()->cost(VF, Ctx);
1089+
}
1090+
10431091
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
10441092
void VPlan::printLiveIns(raw_ostream &O) const {
10451093
VPSlotTracker SlotTracker(this);

0 commit comments

Comments
 (0)