Skip to content

Commit 337936a

Browse files
authored
[LV] Ignore some costs when loop gets fully unrolled (#106699)
When the VF has a fixed width equal to the number of loop iterations and the tail is not folded by masking, the vector loop executes exactly once, so the comparison instruction and the induction operation will be removed by dead-code elimination (DCE) later. Ignoring the costs of these instructions improves the cost model.
1 parent 73adf26 commit 337936a

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2682,6 +2682,25 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
26822682
return I->second;
26832683
}
26842684

2685+
/// Knowing that loop \p L executes a single vector iteration, add instructions
2686+
/// that will get simplified and thus should not have any cost to \p
2687+
/// InstsToIgnore.
2688+
static void addFullyUnrolledInstructionsToIgnore(
2689+
Loop *L, const LoopVectorizationLegality::InductionList &IL,
2690+
SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2691+
auto *Cmp = L->getLatchCmpInst();
2692+
if (Cmp)
2693+
InstsToIgnore.insert(Cmp);
2694+
for (const auto &[IV, IndDesc] : IL) {
2695+
// Get next iteration value of the induction variable.
2696+
Instruction *IVInst =
2697+
cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2698+
if (all_of(IVInst->users(),
2699+
[&](const User *U) { return U == IV || U == Cmp; }))
2700+
InstsToIgnore.insert(IVInst);
2701+
}
2702+
}
2703+
26852704
void InnerLoopVectorizer::createInductionResumeVPValues(
26862705
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
26872706
SmallPtrSetImpl<PHINode *> *IVSubset) {
@@ -5592,14 +5611,23 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
55925611
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
55935612
InstructionCost Cost;
55945613

5614+
// If the vector loop gets executed exactly once with the given VF, ignore the
5615+
// costs of comparison and induction instructions, as they'll get simplified
5616+
// away.
5617+
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5618+
auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5619+
if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5620+
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5621+
ValuesToIgnoreForVF);
5622+
55955623
// For each block.
55965624
for (BasicBlock *BB : TheLoop->blocks()) {
55975625
InstructionCost BlockCost;
55985626

55995627
// For each instruction in the old loop.
56005628
for (Instruction &I : BB->instructionsWithoutDebug()) {
56015629
// Skip ignored values.
5602-
if (ValuesToIgnore.count(&I) ||
5630+
if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
56035631
(VF.isVector() && VecValuesToIgnore.count(&I)))
56045632
continue;
56055633

@@ -7281,6 +7309,17 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
72817309
continue;
72827310
IVInsts.push_back(CI);
72837311
}
7312+
7313+
// If the vector loop gets executed exactly once with the given VF, ignore
7314+
// the costs of comparison and induction instructions, as they'll get
7315+
// simplified away.
7316+
// TODO: Remove this code after stepping away from the legacy cost model and
7317+
// adding code to simplify VPlans before calculating their costs.
7318+
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7319+
if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7320+
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7321+
CostCtx.SkipCostComputation);
7322+
72847323
for (Instruction *IVInst : IVInsts) {
72857324
if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
72867325
continue;

llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,10 @@ define i64 @test(ptr %a, ptr %b) #0 {
1212
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
1313
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
1414
; CHECK: Cost for VF 8: 26
15-
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
1615
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
17-
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
1816
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
19-
; CHECK: Cost for VF 16: 50
20-
; CHECK: LV: Selecting VF: vscale x 2
17+
; CHECK: Cost for VF 16: 48
18+
; CHECK: LV: Selecting VF: 16
2119
entry:
2220
br label %for.body
2321

@@ -50,9 +48,8 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
5048
; CHECK: Cost for VF 8: 26
5149
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
5250
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
53-
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
5451
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
55-
; CHECK: Cost for VF 16: 50
52+
; CHECK: Cost for VF 16: 49
5653
; CHECK: LV: Selecting VF: vscale x 2
5754
entry:
5855
br label %for.body
@@ -86,13 +83,10 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
8683
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
8784
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
8885
; CHECK: Cost for VF 8: 27
89-
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
9086
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
91-
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
9287
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
93-
; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
9488
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
95-
; CHECK: Cost for VF 16: 51
89+
; CHECK: Cost for VF 16: 48
9690
; CHECK: LV: Selecting VF: 16
9791
entry:
9892
br label %for.body
@@ -125,11 +119,9 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
125119
; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
126120
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
127121
; CHECK: Cost for VF 8: 12
128-
; CHECK-NEXT: Cost of 8 for VF 16: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
129122
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
130-
; CHECK-NEXT: Cost of 8 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
131123
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
132-
; CHECK: Cost for VF 16: 20
124+
; CHECK: Cost for VF 16: 4
133125
; CHECK: LV: Selecting VF: 16
134126
entry:
135127
br label %for.body

0 commit comments

Comments
 (0)