-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[VPlan] First step towards VPlan cost modeling. #92555
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
98230db
6330a67
0da9e25
52786ae
7043085
d2fa5ee
b1ab1b8
c91f8ba
e1cd132
e66563b
9a4111d
faa855d
860aae1
32fc296
17442f9
b27201c
24e03bd
1ae4d60
423adca
8ff3109
f49ed3f
204dfaf
389e841
2c3e408
9c69bfb
7b7581b
de59992
f5f3581
d13777c
9c99b10
bd14e40
b316c55
692a55c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( | |||||
cl::desc("A flag that overrides the target's max interleave factor for " | ||||||
"vectorized loops.")); | ||||||
|
||||||
static cl::opt<unsigned> ForceTargetInstructionCost( | ||||||
cl::opt<unsigned> ForceTargetInstructionCost( | ||||||
"force-target-instruction-cost", cl::init(0), cl::Hidden, | ||||||
cl::desc("A flag that overrides the target's expected cost for " | ||||||
"an instruction to a single constant value. Mostly " | ||||||
|
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { | |||||
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); | ||||||
} | ||||||
|
||||||
/// A helper function that returns the reciprocal of the block probability of | ||||||
/// predicated blocks. If we return X, we are assuming the predicated block | ||||||
/// will execute once for every X iterations of the loop header. | ||||||
/// | ||||||
/// TODO: We should use actual block probability here, if available. Currently, | ||||||
/// we always assume predicated blocks have a 50% chance of executing. | ||||||
static unsigned getReciprocalPredBlockProb() { return 2; } | ||||||
|
||||||
/// Returns "best known" trip count for the specified loop \p L as defined by | ||||||
/// the following procedure: | ||||||
/// 1) Returns exact trip count if it is known. | ||||||
|
@@ -1621,6 +1613,16 @@ class LoopVectorizationCostModel { | |||||
/// \p VF is the vectorization factor chosen for the original loop. | ||||||
bool isEpilogueVectorizationProfitable(const ElementCount VF) const; | ||||||
|
||||||
/// Return the cost of instructions in an inloop reduction pattern, if I is | ||||||
/// part of that pattern. | ||||||
std::optional<InstructionCost> | ||||||
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better called getInLoopReductionPatternCost()? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will adjust separately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Very well. Another suggestion is to use Invalid cost for "no cost" instead of optional. |
||||||
TTI::TargetCostKind CostKind) const; | ||||||
|
||||||
/// Returns the execution time cost of an instruction for a given vector | ||||||
/// width. Vector width of one means scalar. | ||||||
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); | ||||||
|
||||||
private: | ||||||
unsigned NumPredStores = 0; | ||||||
|
||||||
|
@@ -1646,21 +1648,11 @@ class LoopVectorizationCostModel { | |||||
/// of elements. | ||||||
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); | ||||||
|
||||||
/// Returns the execution time cost of an instruction for a given vector | ||||||
/// width. Vector width of one means scalar. | ||||||
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); | ||||||
|
||||||
/// The cost-computation logic from getInstructionCost which provides | ||||||
/// the vector type as an output parameter. | ||||||
InstructionCost getInstructionCost(Instruction *I, ElementCount VF, | ||||||
Type *&VectorTy); | ||||||
|
||||||
/// Return the cost of instructions in an inloop reduction pattern, if I is | ||||||
/// part of that pattern. | ||||||
std::optional<InstructionCost> | ||||||
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, | ||||||
TTI::TargetCostKind CostKind) const; | ||||||
|
||||||
/// Calculate vectorization cost of memory instruction \p I. | ||||||
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); | ||||||
|
||||||
|
@@ -7288,7 +7280,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { | |||||
if (!MaxFactors.hasVector()) | ||||||
return VectorizationFactor::Disabled(); | ||||||
|
||||||
// Select the optimal vectorization factor. | ||||||
// Select the optimal vectorization factor according to the legacy cost-model. | ||||||
// This is now only used to verify the decisions by the new VPlan-based | ||||||
// cost-model and will be retired once the VPlan-based cost-model is | ||||||
// stabilized. | ||||||
VectorizationFactor VF = selectVectorizationFactor(VFCandidates); | ||||||
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); | ||||||
if (!hasPlanWithVF(VF.Width)) { | ||||||
|
@@ -7299,6 +7294,166 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { | |||||
return VF; | ||||||
} | ||||||
|
||||||
/// Returns the cost of \p UI at vectorization factor \p VF as computed by the | ||||||
/// legacy cost model, i.e. the first element of the pair returned by | ||||||
/// CM.getInstructionCost(). | ||||||
InstructionCost VPCostContext::getLegacyCost(Instruction *UI, | ||||||
ElementCount VF) const { | ||||||
return CM.getInstructionCost(UI, VF).first; | ||||||
} | ||||||
|
||||||
/// Returns true if \p UI should be left out of the VPlan-based cost | ||||||
/// computation: either \p UI is in SkipCostComputation, or \p IsVector is set | ||||||
/// and \p UI is in CM.VecValuesToIgnore. | ||||||
bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { | ||||||
return (IsVector && CM.VecValuesToIgnore.contains(UI)) || | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The legacy cost of recipes (and VF>1) whose underlying instruction belongs to VecValuesToIgnore should be zero? In which case checking them here is redundant. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the moment, VecValuesToIgnore is used to skip instructions while iterating over them in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, and these cost-ignorant Instructions may end up having recipes. |
||||||
SkipCostComputation.contains(UI); | ||||||
} | ||||||
|
||||||
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, | ||||||
ElementCount VF) const { | ||||||
InstructionCost Cost = 0; | ||||||
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); | ||||||
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM); | ||||||
|
||||||
// Cost modeling for inductions is inaccurate in the legacy cost model | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth indicating that this is restricted to the cost of the induction bump only. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added as below, thanks! |
||||||
// compared to the recipes that are generated. To match here initially during | ||||||
// VPlan cost model bring up directly use the induction costs from the legacy | ||||||
// cost model. Note that we do this as pre-processing; the VPlan may not have | ||||||
// any recipes associated with the original induction increment instruction. | ||||||
// We precompute the cost of both induction increment instructions that are | ||||||
// represented by recipes and those that are not, to avoid distinguishing | ||||||
// between them here, and skip all recipes that represent induction increments | ||||||
// (the former case) later on, if they exist, to avoid counting them twice. | ||||||
// TODO: Switch to more accurate costing based on VPlan. | ||||||
for (const auto &[IV, _] : Legal->getInductionVars()) { | ||||||
Instruction *IVInc = cast<Instruction>( | ||||||
IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); | ||||||
assert(!CostCtx.SkipCostComputation.contains(IVInc) && | ||||||
"Same IV increment for multiple inductions?"); | ||||||
CostCtx.SkipCostComputation.insert(IVInc); | ||||||
InstructionCost InductionCost = CostCtx.getLegacyCost(IVInc, VF); | ||||||
LLVM_DEBUG({ | ||||||
dbgs() << "Cost of " << InductionCost << " for VF " << VF | ||||||
<< ":\n induction increment " << *IVInc << "\n"; | ||||||
IVInc->dump(); | ||||||
}); | ||||||
Cost += InductionCost; | ||||||
} | ||||||
|
||||||
/// Compute the cost of all exiting conditions of the loop using the legacy | ||||||
/// cost model. This is to match the legacy behavior, which adds the cost of | ||||||
/// all exit conditions. Note that this over-estimates the cost, as there will | ||||||
/// be a single condition to control the vector loop. | ||||||
SmallVector<BasicBlock *> Exiting; | ||||||
CM.TheLoop->getExitingBlocks(Exiting); | ||||||
// Add the cost of all exit conditions. | ||||||
for (BasicBlock *EB : Exiting) { | ||||||
auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); | ||||||
if (!Term) | ||||||
continue; | ||||||
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { | ||||||
assert(!CostCtx.SkipCostComputation.contains(CondI) && | ||||||
"Condition already skipped?"); | ||||||
CostCtx.SkipCostComputation.insert(CondI); | ||||||
Cost += CostCtx.getLegacyCost(CondI, VF); | ||||||
} | ||||||
} | ||||||
|
||||||
// The legacy cost model has special logic to compute the cost of in-loop | ||||||
// reductions, which may be smaller than the sum of all instructions involved | ||||||
// in the reduction. For AnyOf reductions, VPlan codegen may remove the select | ||||||
// which the legacy cost model uses to assign cost. Pre-compute their costs | ||||||
// for now. | ||||||
// TODO: Switch to costing based on VPlan once the logic has been ported. | ||||||
for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { | ||||||
if (!CM.isInLoopReduction(RedPhi) && | ||||||
!RecurrenceDescriptor::isAnyOfRecurrenceKind( | ||||||
RdxDesc.getRecurrenceKind())) | ||||||
continue; | ||||||
|
||||||
// AnyOf reduction codegen may remove the select. To match the legacy cost | ||||||
// model, pre-compute the cost for AnyOf reductions here. | ||||||
if (RecurrenceDescriptor::isAnyOfRecurrenceKind( | ||||||
RdxDesc.getRecurrenceKind())) { | ||||||
auto *Select = cast<SelectInst>(*find_if( | ||||||
RedPhi->users(), [](User *U) { return isa<SelectInst>(U); })); | ||||||
assert(!CostCtx.SkipCostComputation.contains(Select) && | ||||||
"reduction op visited multiple times"); | ||||||
CostCtx.SkipCostComputation.insert(Select); | ||||||
auto ReductionCost = CostCtx.getLegacyCost(Select, VF); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Call CM.getInstructionCost() directly, as above? Or change above to call CostCtx.getLegacyCost(). Better to be consistent. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated to use CostCtx.getLegacyCost(), thanks! |
||||||
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF | ||||||
<< ":\n any-of reduction " << *Select << "\n"); | ||||||
Cost += ReductionCost; | ||||||
continue; | ||||||
} | ||||||
|
||||||
const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); | ||||||
SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), | ||||||
ChainOps.end()); | ||||||
// Also include the operands of instructions in the chain, as the cost-model | ||||||
// may mark extends as free. | ||||||
for (auto *ChainOp : ChainOps) { | ||||||
for (Value *Op : ChainOp->operands()) { | ||||||
if (auto *I = dyn_cast<Instruction>(Op)) | ||||||
ChainOpsAndOperands.insert(I); | ||||||
} | ||||||
} | ||||||
|
||||||
// Pre-compute the cost for I, if it has a reduction pattern cost. | ||||||
for (Instruction *I : ChainOpsAndOperands) { | ||||||
auto ReductionCost = CM.getReductionPatternCost( | ||||||
I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth a comment that we precompute the cost of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added, thanks. |
||||||
if (!ReductionCost) | ||||||
continue; | ||||||
|
||||||
assert(!CostCtx.SkipCostComputation.contains(I) && | ||||||
"reduction op visited multiple times"); | ||||||
CostCtx.SkipCostComputation.insert(I); | ||||||
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF | ||||||
<< ":\n in-loop reduction " << *I << "\n"); | ||||||
Cost += *ReductionCost; | ||||||
} | ||||||
} | ||||||
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth emphasizing that
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added, thanks! |
||||||
// Now compute and add the VPlan-based cost. | ||||||
Cost += Plan.cost(VF, CostCtx); | ||||||
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); | ||||||
return Cost; | ||||||
} | ||||||
|
||||||
/// Selects the VPlan and vectorization factor that the VPlan-based cost model | ||||||
/// (cost()) considers best, and returns that plan. | ||||||
/// | ||||||
/// If there is exactly one plan with exactly one VF, it is returned as-is. | ||||||
/// Otherwise the scalar VF is costed as the baseline, every non-scalar VF of | ||||||
/// every plan is costed, and the candidate preferred by isMoreProfitable() is | ||||||
/// kept. The chosen VF is set on the returned plan via setVF(). | ||||||
/// | ||||||
/// NOTE(review): VPlans[0] is dereferenced without an emptiness check; | ||||||
/// presumably callers guarantee at least one plan exists -- confirm. | ||||||
VPlan &LoopVectorizationPlanner::getBestPlan() const { | ||||||
// If there is a single VPlan with a single VF, return it directly. | ||||||
VPlan &FirstPlan = *VPlans[0]; | ||||||
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) | ||||||
return FirstPlan; | ||||||
|
||||||
VPlan *BestPlan = &FirstPlan; | ||||||
ElementCount ScalarVF = ElementCount::getFixed(1); | ||||||
assert(hasPlanWithVF(ScalarVF) && | ||||||
"More than a single plan/VF w/o any plan having scalar VF"); | ||||||
|
||||||
// Cost the scalar VF first: it seeds BestFactor and is also passed as the | ||||||
// scalar cost of every vector candidate constructed below. | ||||||
InstructionCost ScalarCost = cost(getBestPlanFor(ScalarVF), ScalarVF); | ||||||
VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost); | ||||||
|
||||||
bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; | ||||||
if (ForceVectorization) { | ||||||
// Ignore scalar width, because the user explicitly wants vectorization. | ||||||
// Initialize cost to max so that VF = 2 is, at least, chosen during cost | ||||||
// evaluation. | ||||||
BestFactor.Cost = InstructionCost::getMax(); | ||||||
} | ||||||
|
||||||
// Visit every non-scalar VF of every plan, keeping the candidate that | ||||||
// isMoreProfitable() prefers over the best one seen so far. | ||||||
for (auto &P : VPlans) { | ||||||
for (ElementCount VF : P->vectorFactors()) { | ||||||
if (VF.isScalar()) | ||||||
continue; | ||||||
InstructionCost Cost = cost(*P, VF); | ||||||
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); | ||||||
if (isMoreProfitable(CurrentFactor, BestFactor)) { | ||||||
BestFactor = CurrentFactor; | ||||||
BestPlan = &*P; | ||||||
} | ||||||
} | ||||||
} | ||||||
// Record the selected VF on the winning plan before returning it. | ||||||
BestPlan->setVF(BestFactor.Width); | ||||||
return *BestPlan; | ||||||
} | ||||||
|
||||||
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { | ||||||
assert(count_if(VPlans, | ||||||
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == | ||||||
|
@@ -10157,8 +10312,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |||||
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, | ||||||
PSI, Checks); | ||||||
|
||||||
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); | ||||||
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); | ||||||
VPlan &BestPlan = LVP.getBestPlan(); | ||||||
assert(size(BestPlan.vectorFactors()) == 1 && | ||||||
"Plan should have a single VF"); | ||||||
ElementCount Width = *BestPlan.vectorFactors().begin(); | ||||||
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width | ||||||
<< "\n"); | ||||||
assert(VF.Width == Width && | ||||||
"VPlan cost model and legacy cost model disagreed"); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth adding a comment in LVP::selectVectorizationFactor(), which selects the best VF based on the legacy cost model, that it is destined to be retired once computing the best VF based on VPlan costs is confirmed to agree and stabilizes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a comment to both the call site and header for selectVectorizationFactor; with this patch, it is only used to cross-check the VPlan-based one, but the VPlan-based one will pick the plan to execute via getBestPlan in the main vector code path (the epilogue vectorization code path is not updated yet). |
||||||
LVP.executePlan(Width, IC, BestPlan, LB, DT, false); | ||||||
++LoopsVectorized; | ||||||
|
||||||
// Add metadata to disable runtime unrolling a scalar loop when there | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(unrelated to this patch).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will adjust separately.