@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290
290
cl::desc(" A flag that overrides the target's max interleave factor for "
291
291
" vectorized loops." ));
292
292
293
- cl::opt<unsigned> ForceTargetInstructionCost(
293
+ static cl::opt<unsigned > ForceTargetInstructionCost (
294
294
" force-target-instruction-cost" , cl::init(0 ), cl::Hidden,
295
295
cl::desc(" A flag that overrides the target's expected cost for "
296
296
" an instruction to a single constant value. Mostly "
@@ -412,6 +412,14 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412
412
return DL.getTypeAllocSizeInBits (Ty) != DL.getTypeSizeInBits (Ty);
413
413
}
414
414
415
+ /// A helper function that returns the reciprocal of the block probability of
416
+ /// predicated blocks. If we return X, we are assuming the predicated block
417
+ /// will execute once for every X iterations of the loop header.
418
+ ///
419
+ /// TODO: We should use actual block probability here, if available. Currently,
420
+ /// we always assume predicated blocks have a 50% chance of executing (X == 2).
421
+ static unsigned getReciprocalPredBlockProb () { return 2 ; }
422
+
415
423
/// Returns "best known" trip count for the specified loop \p L as defined by
416
424
/// the following procedure:
417
425
/// 1) Returns exact trip count if it is known.
@@ -1613,16 +1621,6 @@ class LoopVectorizationCostModel {
1613
1621
/// \p VF is the vectorization factor chosen for the original loop.
1614
1622
bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1615
1623
1616
- /// Return the cost of instructions in an inloop reduction pattern, if I is
1617
- /// part of that pattern.
1618
- std::optional<InstructionCost>
1619
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1620
- TTI::TargetCostKind CostKind) const;
1621
-
1622
- /// Returns the execution time cost of an instruction for a given vector
1623
- /// width. Vector width of one means scalar.
1624
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1625
-
1626
1624
private:
1627
1625
unsigned NumPredStores = 0 ;
1628
1626
@@ -1648,11 +1646,21 @@ class LoopVectorizationCostModel {
1648
1646
/// of elements.
1649
1647
ElementCount getMaxLegalScalableVF (unsigned MaxSafeElements);
1650
1648
1649
+ /// Returns the execution time cost of an instruction for a given vector
1650
+ /// width. Vector width of one means scalar.
1651
+ VectorizationCostTy getInstructionCost (Instruction *I, ElementCount VF);
1652
+
1651
1653
/// The cost-computation logic from getInstructionCost which provides
1652
1654
/// the vector type as an output parameter.
1653
1655
InstructionCost getInstructionCost (Instruction *I, ElementCount VF,
1654
1656
Type *&VectorTy);
1655
1657
1658
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
1659
+ /// part of that pattern.
1660
+ std::optional<InstructionCost>
1661
+ getReductionPatternCost (Instruction *I, ElementCount VF, Type *VectorTy,
1662
+ TTI::TargetCostKind CostKind) const ;
1663
+
1656
1664
/// Calculate vectorization cost of memory instruction \p I.
1657
1665
InstructionCost getMemoryInstructionCost (Instruction *I, ElementCount VF);
1658
1666
@@ -7280,10 +7288,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7280
7288
if (!MaxFactors.hasVector ())
7281
7289
return VectorizationFactor::Disabled ();
7282
7290
7283
- // Select the optimal vectorization factor according to the legacy cost-model.
7284
- // This is now only used to verify the decisions by the new VPlan-based
7285
- // cost-model and will be retired once the VPlan-based cost-model is
7286
- // stabilized.
7291
+ // Select the optimal vectorization factor.
7287
7292
VectorizationFactor VF = selectVectorizationFactor (VFCandidates);
7288
7293
assert ((VF.Width .isScalar () || VF.ScalarCost > 0 ) && " when vectorizing, the scalar cost must be non-zero." );
7289
7294
if (!hasPlanWithVF (VF.Width )) {
@@ -7294,196 +7299,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7294
7299
return VF;
7295
7300
}
7296
7301
7297
- InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7298
- ElementCount VF) const {
7299
- return CM.getInstructionCost(UI, VF).first;
7300
- }
7301
-
7302
- bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7303
- return (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7304
- SkipCostComputation.contains(UI);
7305
- }
7306
-
7307
- InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7308
- ElementCount VF) const {
7309
- InstructionCost Cost = 0;
7310
- LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7311
- VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7312
-
7313
- // Cost modeling for inductions is inaccurate in the legacy cost model
7314
- // compared to the recipes that are generated. To match here initially during
7315
- // VPlan cost model bring up directly use the induction costs from the legacy
7316
- // cost model. Note that we do this as pre-processing; the VPlan may not have
7317
- // any recipes associated with the original induction increment instruction
7318
- // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7319
- // the cost of both induction increment instructions that are represented by
7320
- // recipes and those that are not, to avoid distinguishing between them here,
7321
- // and skip all recipes that represent induction increments (the former case)
7322
- // later on, if they exist, to avoid counting them twice. Similarly we
7323
- // pre-compute the cost of any optimized truncates.
7324
- // TODO: Switch to more accurate costing based on VPlan.
7325
- for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7326
- Instruction *IVInc = cast<Instruction>(
7327
- IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7328
- if (CostCtx.SkipCostComputation.insert(IVInc).second) {
7329
- InstructionCost InductionCost = CostCtx.getLegacyCost(IVInc, VF);
7330
- LLVM_DEBUG({
7331
- dbgs() << "Cost of " << InductionCost << " for VF " << VF
7332
- << ":\n induction increment " << *IVInc << "\n";
7333
- IVInc->dump();
7334
- });
7335
- Cost += InductionCost;
7336
- }
7337
- for (User *U : IV->users()) {
7338
- auto *CI = cast<Instruction>(U);
7339
- if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7340
- continue;
7341
- assert(!CostCtx.SkipCostComputation.contains(CI) &&
7342
- "Same cast for multiple inductions?");
7343
- CostCtx.SkipCostComputation.insert(CI);
7344
- InstructionCost CastCost = CostCtx.getLegacyCost(CI, VF);
7345
- LLVM_DEBUG({
7346
- dbgs() << "Cost of " << CastCost << " for VF " << VF
7347
- << ":\n induction cast " << *CI << "\n";
7348
- CI->dump();
7349
- });
7350
- Cost += CastCost;
7351
- }
7352
- }
7353
-
7354
- /// Compute the cost of all exiting conditions of the loop using the legacy
7355
- /// cost model. This is to match the legacy behavior, which adds the cost of
7356
- /// all exit conditions. Note that this over-estimates the cost, as there will
7357
- /// be a single condition to control the vector loop.
7358
- SmallVector<BasicBlock *> Exiting;
7359
- CM.TheLoop->getExitingBlocks(Exiting);
7360
- SetVector<Instruction *> ExitInstrs;
7361
- // Collect all exit conditions.
7362
- for (BasicBlock *EB : Exiting) {
7363
- auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7364
- if (!Term)
7365
- continue;
7366
- if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7367
- ExitInstrs.insert(CondI);
7368
- }
7369
- }
7370
- // Compute the cost of all instructions only feeding the exit conditions.
7371
- for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7372
- Instruction *CondI = ExitInstrs[I];
7373
- if (!OrigLoop->contains(CondI) ||
7374
- !CostCtx.SkipCostComputation.insert(CondI).second)
7375
- continue;
7376
- Cost += CostCtx.getLegacyCost(CondI, VF);
7377
- for (Value *Op : CondI->operands()) {
7378
- auto *OpI = dyn_cast<Instruction>(Op);
7379
- if (!OpI || any_of(OpI->users(), [&ExitInstrs](User *U) {
7380
- return !ExitInstrs.contains(cast<Instruction>(U));
7381
- }))
7382
- continue;
7383
- ExitInstrs.insert(OpI);
7384
- }
7385
- }
7386
-
7387
- // The legacy cost model has special logic to compute the cost of in-loop
7388
- // reductions, which may be smaller than the sum of all instructions involved
7389
- // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7390
- // which the legacy cost model uses to assign cost. Pre-compute their costs
7391
- // for now.
7392
- // TODO: Switch to costing based on VPlan once the logic has been ported.
7393
- for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7394
- if (!CM.isInLoopReduction(RedPhi) &&
7395
- !RecurrenceDescriptor::isAnyOfRecurrenceKind(
7396
- RdxDesc.getRecurrenceKind()))
7397
- continue;
7398
-
7399
- // AnyOf reduction codegen may remove the select. To match the legacy cost
7400
- // model, pre-compute the cost for AnyOf reductions here.
7401
- if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7402
- RdxDesc.getRecurrenceKind())) {
7403
- auto *Select = cast<SelectInst>(*find_if(
7404
- RedPhi->users(), [](User *U) { return isa<SelectInst>(U); }));
7405
- assert(!CostCtx.SkipCostComputation.contains(Select) &&
7406
- "reduction op visited multiple times");
7407
- CostCtx.SkipCostComputation.insert(Select);
7408
- auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
7409
- LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7410
- << ":\n any-of reduction " << *Select << "\n");
7411
- Cost += ReductionCost;
7412
- continue;
7413
- }
7414
-
7415
- const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7416
- SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7417
- ChainOps.end());
7418
- // Also include the operands of instructions in the chain, as the cost-model
7419
- // may mark extends as free.
7420
- for (auto *ChainOp : ChainOps) {
7421
- for (Value *Op : ChainOp->operands()) {
7422
- if (auto *I = dyn_cast<Instruction>(Op))
7423
- ChainOpsAndOperands.insert(I);
7424
- }
7425
- }
7426
-
7427
- // Pre-compute the cost for I, if it has a reduction pattern cost.
7428
- for (Instruction *I : ChainOpsAndOperands) {
7429
- auto ReductionCost = CM.getReductionPatternCost(
7430
- I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7431
- if (!ReductionCost)
7432
- continue;
7433
-
7434
- assert(!CostCtx.SkipCostComputation.contains(I) &&
7435
- "reduction op visited multiple times");
7436
- CostCtx.SkipCostComputation.insert(I);
7437
- LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7438
- << ":\n in-loop reduction " << *I << "\n");
7439
- Cost += *ReductionCost;
7440
- }
7441
- }
7442
-
7443
- // Now compute and add the VPlan-based cost.
7444
- Cost += Plan.cost(VF, CostCtx);
7445
- LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7446
- return Cost;
7447
- }
7448
-
7449
- VPlan &LoopVectorizationPlanner::getBestPlan() const {
7450
- // If there is a single VPlan with a single VF, return it directly.
7451
- VPlan &FirstPlan = *VPlans[0];
7452
- if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7453
- return FirstPlan;
7454
-
7455
- VPlan *BestPlan = &FirstPlan;
7456
- ElementCount ScalarVF = ElementCount::getFixed(1);
7457
- assert(hasPlanWithVF(ScalarVF) &&
7458
- "More than a single plan/VF w/o any plan having scalar VF");
7459
-
7460
- InstructionCost ScalarCost = cost(getBestPlanFor(ScalarVF), ScalarVF);
7461
- VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7462
-
7463
- bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7464
- if (ForceVectorization) {
7465
- // Ignore scalar width, because the user explicitly wants vectorization.
7466
- // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7467
- // evaluation.
7468
- BestFactor.Cost = InstructionCost::getMax();
7469
- }
7470
-
7471
- for (auto &P : VPlans) {
7472
- for (ElementCount VF : P->vectorFactors()) {
7473
- if (VF.isScalar())
7474
- continue;
7475
- InstructionCost Cost = cost(*P, VF);
7476
- VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7477
- if (isMoreProfitable(CurrentFactor, BestFactor)) {
7478
- BestFactor = CurrentFactor;
7479
- BestPlan = &*P;
7480
- }
7481
- }
7482
- }
7483
- BestPlan->setVF(BestFactor.Width);
7484
- return *BestPlan;
7485
- }
7486
-
7487
7302
VPlan &LoopVectorizationPlanner::getBestPlanFor (ElementCount VF) const {
7488
7303
assert (count_if (VPlans,
7489
7304
[VF](const VPlanPtr &Plan) { return Plan->hasVF (VF); }) ==
@@ -10342,15 +10157,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10342
10157
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10343
10158
PSI, Checks);
10344
10159
10345
- VPlan &BestPlan = LVP.getBestPlan();
10346
- assert(size(BestPlan.vectorFactors()) == 1 &&
10347
- "Plan should have a single VF");
10348
- ElementCount Width = *BestPlan.vectorFactors().begin();
10349
- LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10350
- << "\n");
10351
- assert(VF.Width == Width &&
10352
- "VPlan cost model and legacy cost model disagreed");
10353
- LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
10160
+ VPlan &BestPlan = LVP.getBestPlanFor (VF.Width );
10161
+ LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10354
10162
++LoopsVectorized;
10355
10163
10356
10164
// Add metadata to disable runtime unrolling a scalar loop when there
0 commit comments