@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290
290
cl::desc(" A flag that overrides the target's max interleave factor for "
291
291
" vectorized loops." ));
292
292
293
- static cl::opt<unsigned > ForceTargetInstructionCost (
293
+ cl::opt<unsigned > ForceTargetInstructionCost (
294
294
" force-target-instruction-cost" , cl::init(0 ), cl::Hidden,
295
295
cl::desc(" A flag that overrides the target's expected cost for "
296
296
" an instruction to a single constant value. Mostly "
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412
412
return DL.getTypeAllocSizeInBits (Ty) != DL.getTypeSizeInBits (Ty);
413
413
}
414
414
415
- // / A helper function that returns the reciprocal of the block probability of
416
- // / predicated blocks. If we return X, we are assuming the predicated block
417
- // / will execute once for every X iterations of the loop header.
418
- // /
419
- // / TODO: We should use actual block probability here, if available. Currently,
420
- // / we always assume predicated blocks have a 50% chance of executing.
421
- static unsigned getReciprocalPredBlockProb () { return 2 ; }
422
-
423
415
// / Returns "best known" trip count for the specified loop \p L as defined by
424
416
// / the following procedure:
425
417
// / 1) Returns exact trip count if it is known.
@@ -1608,6 +1600,16 @@ class LoopVectorizationCostModel {
1608
1600
// / \p VF is the vectorization factor chosen for the original loop.
1609
1601
bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1610
1602
1603
+ // / Returns the execution time cost of an instruction for a given vector
1604
+ // / width. Vector width of one means scalar.
1605
+ InstructionCost getInstructionCost (Instruction *I, ElementCount VF);
1606
+
1607
+ // / Return the cost of instructions in an inloop reduction pattern, if I is
1608
+ // / part of that pattern.
1609
+ std::optional<InstructionCost>
1610
+ getReductionPatternCost (Instruction *I, ElementCount VF, Type *VectorTy,
1611
+ TTI::TargetCostKind CostKind) const ;
1612
+
1611
1613
private:
1612
1614
unsigned NumPredStores = 0 ;
1613
1615
@@ -1633,16 +1635,6 @@ class LoopVectorizationCostModel {
1633
1635
// / of elements.
1634
1636
ElementCount getMaxLegalScalableVF (unsigned MaxSafeElements);
1635
1637
1636
- // / Returns the execution time cost of an instruction for a given vector
1637
- // / width. Vector width of one means scalar.
1638
- InstructionCost getInstructionCost (Instruction *I, ElementCount VF);
1639
-
1640
- // / Return the cost of instructions in an inloop reduction pattern, if I is
1641
- // / part of that pattern.
1642
- std::optional<InstructionCost>
1643
- getReductionPatternCost (Instruction *I, ElementCount VF, Type *VectorTy,
1644
- TTI::TargetCostKind CostKind) const ;
1645
-
1646
1638
// / Calculate vectorization cost of memory instruction \p I.
1647
1639
InstructionCost getMemoryInstructionCost (Instruction *I, ElementCount VF);
1648
1640
@@ -7288,7 +7280,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7288
7280
[](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly (); }))
7289
7281
return VectorizationFactor::Disabled ();
7290
7282
7291
- // Select the optimal vectorization factor.
7283
+ // Select the optimal vectorization factor according to the legacy cost-model.
7284
+ // This is now only used to verify the decisions by the new VPlan-based
7285
+ // cost-model and will be retired once the VPlan-based cost-model is
7286
+ // stabilized.
7292
7287
VectorizationFactor VF = selectVectorizationFactor ();
7293
7288
assert ((VF.Width .isScalar () || VF.ScalarCost > 0 ) && " when vectorizing, the scalar cost must be non-zero." );
7294
7289
if (!hasPlanWithVF (VF.Width )) {
@@ -7299,6 +7294,211 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7299
7294
return VF;
7300
7295
}
7301
7296
7297
+ InstructionCost VPCostContext::getLegacyCost (Instruction *UI,
7298
+ ElementCount VF) const {
7299
+ return CM.getInstructionCost (UI, VF);
7300
+ }
7301
+
7302
+ bool VPCostContext::skipCostComputation (Instruction *UI, bool IsVector) const {
7303
+ return CM.ValuesToIgnore .contains (UI) ||
7304
+ (IsVector && CM.VecValuesToIgnore .contains (UI)) ||
7305
+ SkipCostComputation.contains (UI);
7306
+ }
7307
+
7308
+ InstructionCost LoopVectorizationPlanner::cost (VPlan &Plan,
7309
+ ElementCount VF) const {
7310
+ InstructionCost Cost = 0 ;
7311
+ LLVMContext &LLVMCtx = OrigLoop->getHeader ()->getContext ();
7312
+ VPCostContext CostCtx (CM.TTI , Legal->getWidestInductionType (), LLVMCtx, CM);
7313
+
7314
+ // Cost modeling for inductions is inaccurate in the legacy cost model
7315
+ // compared to the recipes that are generated. To match here initially during
7316
+ // VPlan cost model bring up directly use the induction costs from the legacy
7317
+ // cost model. Note that we do this as pre-processing; the VPlan may not have
7318
+ // any recipes associated with the original induction increment instruction
7319
+ // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7320
+ // the cost of induction phis and increments (both that are represented by
7321
+ // recipes and those that are not), to avoid distinguishing between them here,
7322
+ // and skip all recipes that represent induction phis and increments (the
7323
+ // former case) later on, if they exist, to avoid counting them twice.
7324
+ // Similarly we pre-compute the cost of any optimized truncates.
7325
+ // TODO: Switch to more accurate costing based on VPlan.
7326
+ for (const auto &[IV, IndDesc] : Legal->getInductionVars ()) {
7327
+ Instruction *IVInc = cast<Instruction>(
7328
+ IV->getIncomingValueForBlock (OrigLoop->getLoopLatch ()));
7329
+ SmallVector<Instruction *> IVInsts = {IV, IVInc};
7330
+ for (User *U : IV->users ()) {
7331
+ auto *CI = cast<Instruction>(U);
7332
+ if (!CostCtx.CM .isOptimizableIVTruncate (CI, VF))
7333
+ continue ;
7334
+ IVInsts.push_back (CI);
7335
+ }
7336
+ for (Instruction *IVInst : IVInsts) {
7337
+ if (!CostCtx.SkipCostComputation .insert (IVInst).second )
7338
+ continue ;
7339
+ InstructionCost InductionCost = CostCtx.getLegacyCost (IVInst, VF);
7340
+ LLVM_DEBUG ({
7341
+ dbgs () << " Cost of " << InductionCost << " for VF " << VF
7342
+ << " : induction instruction " << *IVInst << " \n " ;
7343
+ });
7344
+ Cost += InductionCost;
7345
+ }
7346
+ }
7347
+
7348
+ // / Compute the cost of all exiting conditions of the loop using the legacy
7349
+ // / cost model. This is to match the legacy behavior, which adds the cost of
7350
+ // / all exit conditions. Note that this over-estimates the cost, as there will
7351
+ // / be a single condition to control the vector loop.
7352
+ SmallVector<BasicBlock *> Exiting;
7353
+ CM.TheLoop ->getExitingBlocks (Exiting);
7354
+ SetVector<Instruction *> ExitInstrs;
7355
+ // Collect all exit conditions.
7356
+ for (BasicBlock *EB : Exiting) {
7357
+ auto *Term = dyn_cast<BranchInst>(EB->getTerminator ());
7358
+ if (!Term)
7359
+ continue ;
7360
+ if (auto *CondI = dyn_cast<Instruction>(Term->getOperand (0 ))) {
7361
+ ExitInstrs.insert (CondI);
7362
+ }
7363
+ }
7364
+ // Compute the cost of all instructions only feeding the exit conditions.
7365
+ for (unsigned I = 0 ; I != ExitInstrs.size (); ++I) {
7366
+ Instruction *CondI = ExitInstrs[I];
7367
+ if (!OrigLoop->contains (CondI) ||
7368
+ !CostCtx.SkipCostComputation .insert (CondI).second )
7369
+ continue ;
7370
+ Cost += CostCtx.getLegacyCost (CondI, VF);
7371
+ for (Value *Op : CondI->operands ()) {
7372
+ auto *OpI = dyn_cast<Instruction>(Op);
7373
+ if (!OpI || any_of (OpI->users (), [&ExitInstrs](User *U) {
7374
+ return !ExitInstrs.contains (cast<Instruction>(U));
7375
+ }))
7376
+ continue ;
7377
+ ExitInstrs.insert (OpI);
7378
+ }
7379
+ }
7380
+
7381
+ // The legacy cost model has special logic to compute the cost of in-loop
7382
+ // reductions, which may be smaller than the sum of all instructions involved
7383
+ // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7384
+ // which the legacy cost model uses to assign cost. Pre-compute their costs
7385
+ // for now.
7386
+ // TODO: Switch to costing based on VPlan once the logic has been ported.
7387
+ for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars ()) {
7388
+ if (!CM.isInLoopReduction (RedPhi) &&
7389
+ !RecurrenceDescriptor::isAnyOfRecurrenceKind (
7390
+ RdxDesc.getRecurrenceKind ()))
7391
+ continue ;
7392
+
7393
+ // AnyOf reduction codegen may remove the select. To match the legacy cost
7394
+ // model, pre-compute the cost for AnyOf reductions here.
7395
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind (
7396
+ RdxDesc.getRecurrenceKind ())) {
7397
+ auto *Select = cast<SelectInst>(*find_if (
7398
+ RedPhi->users (), [](User *U) { return isa<SelectInst>(U); }));
7399
+ assert (!CostCtx.SkipCostComputation .contains (Select) &&
7400
+ " reduction op visited multiple times" );
7401
+ CostCtx.SkipCostComputation .insert (Select);
7402
+ auto ReductionCost = CostCtx.getLegacyCost (Select, VF);
7403
+ LLVM_DEBUG (dbgs () << " Cost of " << ReductionCost << " for VF " << VF
7404
+ << " :\n any-of reduction " << *Select << " \n " );
7405
+ Cost += ReductionCost;
7406
+ continue ;
7407
+ }
7408
+
7409
+ const auto &ChainOps = RdxDesc.getReductionOpChain (RedPhi, OrigLoop);
7410
+ SetVector<Instruction *> ChainOpsAndOperands (ChainOps.begin (),
7411
+ ChainOps.end ());
7412
+ // Also include the operands of instructions in the chain, as the cost-model
7413
+ // may mark extends as free.
7414
+ for (auto *ChainOp : ChainOps) {
7415
+ for (Value *Op : ChainOp->operands ()) {
7416
+ if (auto *I = dyn_cast<Instruction>(Op))
7417
+ ChainOpsAndOperands.insert (I);
7418
+ }
7419
+ }
7420
+
7421
+ // Pre-compute the cost for I, if it has a reduction pattern cost.
7422
+ for (Instruction *I : ChainOpsAndOperands) {
7423
+ auto ReductionCost = CM.getReductionPatternCost (
7424
+ I, VF, ToVectorTy (I->getType (), VF), TTI::TCK_RecipThroughput);
7425
+ if (!ReductionCost)
7426
+ continue ;
7427
+
7428
+ assert (!CostCtx.SkipCostComputation .contains (I) &&
7429
+ " reduction op visited multiple times" );
7430
+ CostCtx.SkipCostComputation .insert (I);
7431
+ LLVM_DEBUG (dbgs () << " Cost of " << ReductionCost << " for VF " << VF
7432
+ << " :\n in-loop reduction " << *I << " \n " );
7433
+ Cost += *ReductionCost;
7434
+ }
7435
+ }
7436
+
7437
+ // Pre-compute the costs for branches except for the backedge, as the number
7438
+ // of replicate regions in a VPlan may not directly match the number of
7439
+ // branches, which would lead to different decisions.
7440
+ // TODO: Compute cost of branches for each replicate region in the VPlan,
7441
+ // which is more accurate than the legacy cost model.
7442
+ for (BasicBlock *BB : OrigLoop->blocks ()) {
7443
+ if (BB == OrigLoop->getLoopLatch ())
7444
+ continue ;
7445
+ CostCtx.SkipCostComputation .insert (BB->getTerminator ());
7446
+ auto BranchCost = CostCtx.getLegacyCost (BB->getTerminator (), VF);
7447
+ Cost += BranchCost;
7448
+ }
7449
+ // Now compute and add the VPlan-based cost.
7450
+ Cost += Plan.cost (VF, CostCtx);
7451
+ LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost << " \n " );
7452
+ return Cost;
7453
+ }
7454
+
7455
+ VPlan &LoopVectorizationPlanner::getBestPlan () const {
7456
+ // If there is a single VPlan with a single VF, return it directly.
7457
+ VPlan &FirstPlan = *VPlans[0 ];
7458
+ if (VPlans.size () == 1 && size (FirstPlan.vectorFactors ()) == 1 )
7459
+ return FirstPlan;
7460
+
7461
+ VPlan *BestPlan = &FirstPlan;
7462
+ ElementCount ScalarVF = ElementCount::getFixed (1 );
7463
+ assert (hasPlanWithVF (ScalarVF) &&
7464
+ " More than a single plan/VF w/o any plan having scalar VF" );
7465
+
7466
+ // TODO: Compute scalar cost using VPlan-based cost model.
7467
+ InstructionCost ScalarCost = CM.expectedCost (ScalarVF);
7468
+ VectorizationFactor BestFactor (ScalarVF, ScalarCost, ScalarCost);
7469
+
7470
+ bool ForceVectorization = Hints.getForce () == LoopVectorizeHints::FK_Enabled;
7471
+ if (ForceVectorization) {
7472
+ // Ignore scalar width, because the user explicitly wants vectorization.
7473
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7474
+ // evaluation.
7475
+ BestFactor.Cost = InstructionCost::getMax ();
7476
+ }
7477
+
7478
+ for (auto &P : VPlans) {
7479
+ for (ElementCount VF : P->vectorFactors ()) {
7480
+ if (VF.isScalar ())
7481
+ continue ;
7482
+ if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
7483
+ LLVM_DEBUG (
7484
+ dbgs ()
7485
+ << " LV: Not considering vector loop of width " << VF
7486
+ << " because it will not generate any vector instructions.\n " );
7487
+ continue ;
7488
+ }
7489
+
7490
+ InstructionCost Cost = cost (*P, VF);
7491
+ VectorizationFactor CurrentFactor (VF, Cost, ScalarCost);
7492
+ if (isMoreProfitable (CurrentFactor, BestFactor)) {
7493
+ BestFactor = CurrentFactor;
7494
+ BestPlan = &*P;
7495
+ }
7496
+ }
7497
+ }
7498
+ BestPlan->setVF (BestFactor.Width );
7499
+ return *BestPlan;
7500
+ }
7501
+
7302
7502
VPlan &LoopVectorizationPlanner::getBestPlanFor (ElementCount VF) const {
7303
7503
assert (count_if (VPlans,
7304
7504
[VF](const VPlanPtr &Plan) { return Plan->hasVF (VF); }) ==
@@ -10169,8 +10369,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10169
10369
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10170
10370
PSI, Checks);
10171
10371
10172
- VPlan &BestPlan = LVP.getBestPlanFor (VF.Width );
10173
- LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10372
+ VPlan &BestPlan = LVP.getBestPlan ();
10373
+ assert (size (BestPlan.vectorFactors ()) == 1 &&
10374
+ " Plan should have a single VF" );
10375
+ ElementCount Width = *BestPlan.vectorFactors ().begin ();
10376
+ LLVM_DEBUG (dbgs () << " VF picked by VPlan cost model: " << Width
10377
+ << " \n " );
10378
+ assert (VF.Width == Width &&
10379
+ " VPlan cost model and legacy cost model disagreed" );
10380
+ LVP.executePlan (Width, IC, BestPlan, LB, DT, false );
10174
10381
++LoopsVectorized;
10175
10382
10176
10383
// Add metadata to disable runtime unrolling a scalar loop when there
0 commit comments