Skip to content

Commit 0041f31

Browse files
authored
Merge branch 'main' into dynamicextent_keep_live
2 parents 7f7dfc8 + 26e42c7 commit 0041f31

File tree

4 files changed

+149
-22
lines changed

4 files changed

+149
-22
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
7777
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
7878
cl::desc("The number of instructions to search for a redundant dmb"));
7979

80+
static cl::opt<int> Aarch64ForceUnrollThreshold(
81+
"aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82+
cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83+
8084
namespace {
8185
class TailFoldingOption {
8286
// These bitfields will only ever be set to something non-zero in operator=,
@@ -4163,12 +4167,15 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
41634167

41644168
std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
41654169
Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4166-
TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4170+
TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
41674171
std::function<InstructionCost(Type *)> InstCost) const {
41684172
if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
41694173
return std::nullopt;
41704174
if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
41714175
return std::nullopt;
4176+
if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4177+
ST->isNonStreamingSVEorSME2Available())
4178+
return std::nullopt;
41724179

41734180
Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
41744181
InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
@@ -4210,6 +4217,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
42104217
ISD == ISD::FDIV || ISD == ISD::FREM)
42114218
if (auto PromotedCost = getFP16BF16PromoteCost(
42124219
Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4220+
// There is no native support for fdiv/frem even with +sve-b16b16.
4221+
/*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
42134222
[&](Type *PromotedTy) {
42144223
return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
42154224
Op1Info, Op2Info);
@@ -4624,7 +4633,8 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
46244633
if (Opcode == Instruction::FCmp) {
46254634
if (auto PromotedCost = getFP16BF16PromoteCost(
46264635
ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4627-
[&](Type *PromotedTy) {
4636+
// TODO: Consider costing SVE FCMPs.
4637+
/*CanUseSVE=*/false, [&](Type *PromotedTy) {
46284638
InstructionCost Cost =
46294639
getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
46304640
CostKind, Op1Info, Op2Info);
@@ -5250,6 +5260,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
52505260
// inlining. Don't unroll auto-vectorized loops either, though do allow
52515261
// unrolling of the scalar remainder.
52525262
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5263+
InstructionCost Cost = 0;
52535264
for (auto *BB : L->getBlocks()) {
52545265
for (auto &I : *BB) {
52555266
// Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5275,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
52645275
continue;
52655276
return;
52665277
}
5278+
5279+
SmallVector<const Value *, 4> Operands(I.operand_values());
5280+
Cost += getInstructionCost(&I, Operands,
5281+
TargetTransformInfo::TCK_SizeAndLatency);
52675282
}
52685283
}
52695284

@@ -5310,6 +5325,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
53105325
UP.UnrollAndJam = true;
53115326
UP.UnrollAndJamInnerLoopThreshold = 60;
53125327
}
5328+
5329+
// Forcing the unrolling of small loops can be very useful because of the
5330+
// taken-branch cost of the backedge.
5331+
if (Cost < Aarch64ForceUnrollThreshold)
5332+
UP.Force = true;
53135333
}
53145334

53155335
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -456,11 +456,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
456456

457457
/// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the
458458
/// architecture features are not present.
459-
std::optional<InstructionCost>
460-
getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind,
461-
TTI::OperandValueInfo Op1Info,
462-
TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
463-
std::function<InstructionCost(Type *)> InstCost) const;
459+
std::optional<InstructionCost> getFP16BF16PromoteCost(
460+
Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
461+
TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
462+
std::function<InstructionCost(Type *)> InstCost) const;
464463

465464
InstructionCost
466465
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,

llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,17 @@ define void @fadd() {
3333
}
3434

3535
define void @fadd_bf16() {
36-
; CHECK-LABEL: 'fadd_bf16'
37-
; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
38-
; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
39-
; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
40-
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
36+
; CHECK-BASE-LABEL: 'fadd_bf16'
37+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
38+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
39+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
40+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
41+
;
42+
; CHECK-BF16-LABEL: 'fadd_bf16'
43+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
44+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
45+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
46+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
4147
;
4248
%NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
4349
%NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
@@ -76,11 +82,17 @@ define void @fsub() {
7682
}
7783

7884
define void @fsub_bf16() {
79-
; CHECK-LABEL: 'fsub_bf16'
80-
; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
81-
; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
82-
; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
83-
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
85+
; CHECK-BASE-LABEL: 'fsub_bf16'
86+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
87+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
88+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
89+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
90+
;
91+
; CHECK-BF16-LABEL: 'fsub_bf16'
92+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
93+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
94+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
95+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
8496
;
8597
%NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
8698
%NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
@@ -160,11 +172,17 @@ define void @fmul() {
160172
}
161173

162174
define void @fmul_bf16() {
163-
; CHECK-LABEL: 'fmul_bf16'
164-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
165-
; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
166-
; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
167-
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
175+
; CHECK-BASE-LABEL: 'fmul_bf16'
176+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
177+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
178+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
179+
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
180+
;
181+
; CHECK-BF16-LABEL: 'fmul_bf16'
182+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
183+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
184+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
185+
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
168186
;
169187
%NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
170188
%NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
2+
; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
3+
4+
; The loop has a small runtime upper bound (at most four iterations) but a
5+
; relatively expensive body. With runtime unrolling enabled, the cost model
6+
; still leaves the loop rolled. Raising the AArch64 force threshold overrides
7+
; that decision and unrolls.
8+
9+
target triple = "aarch64-unknown-linux-gnu"
10+
11+
define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
12+
entry:
13+
br label %loop
14+
15+
; NOFORCE-LABEL: @force_small_loop(
16+
; NOFORCE: loop:
17+
; NOFORCE: br i1 %cond, label %body, label %exit
18+
; NOFORCE: body:
19+
; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
20+
; NOFORCE: latch:
21+
; NOFORCE: br i1 %cmp2, label %loop, label %exit
22+
; NOFORCE: ret void
23+
; NOFORCE-NOT: loop.1:
24+
;
25+
; FORCE-LABEL: @force_small_loop(
26+
; FORCE: loop:
27+
; FORCE: br i1 %cond, label %body, label %exit
28+
; FORCE: loop.1:
29+
; FORCE: br i1 true, label %body.1, label %exit
30+
; FORCE: body.1:
31+
; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
32+
; FORCE: latch.1:
33+
; FORCE: br i1 %cmp2.1, label %loop, label %exit
34+
; FORCE: ret void
35+
36+
loop:
37+
%i = phi i32 [ 0, %entry ], [ %inc, %latch ]
38+
%ptra = getelementptr inbounds i32, ptr %a, i32 %i
39+
%pa = load i32, ptr %ptra, align 4
40+
%tmp0 = mul nsw i32 %pa, %pa
41+
%tmp1 = add nsw i32 %tmp0, %pa
42+
%tmp2 = shl i32 %tmp1, 1
43+
%tmp3 = ashr i32 %tmp2, 1
44+
%tmp4 = xor i32 %tmp3, %pa
45+
%tmp5 = add nsw i32 %tmp4, 7
46+
%tmp6 = mul nsw i32 %tmp5, 5
47+
%tmp7 = add nsw i32 %tmp6, %tmp4
48+
%tmp8 = mul nsw i32 %tmp7, %tmp3
49+
%tmp9 = add nsw i32 %tmp8, %tmp7
50+
%tmp10 = xor i32 %tmp9, %tmp6
51+
%tmp11 = add nsw i32 %tmp10, %tmp8
52+
%tmp12 = mul nsw i32 %tmp11, 9
53+
%tmp13 = add nsw i32 %tmp12, %tmp10
54+
%tmp14 = xor i32 %tmp13, %tmp11
55+
%cond = icmp ult i32 %i, %n
56+
br i1 %cond, label %body, label %exit
57+
58+
body:
59+
%ptrb = getelementptr inbounds i32, ptr %b, i32 %i
60+
%pb = load i32, ptr %ptrb, align 4
61+
%sum = add nsw i32 %pb, %tmp14
62+
%diff = sub nsw i32 %sum, %pa
63+
%mix1 = mul nsw i32 %diff, 3
64+
%mix2 = add nsw i32 %mix1, %tmp3
65+
%mix3 = xor i32 %mix2, %diff
66+
%mix4 = add nsw i32 %mix3, %tmp0
67+
%mix5 = mul nsw i32 %mix4, 11
68+
%mix6 = add nsw i32 %mix5, %mix2
69+
%mix7 = xor i32 %mix6, %mix5
70+
%mix8 = add nsw i32 %mix7, %mix3
71+
%mix9 = mul nsw i32 %mix8, 13
72+
%mix10 = add nsw i32 %mix9, %mix8
73+
%mix11 = xor i32 %mix10, %mix7
74+
%mix12 = add nsw i32 %mix11, %mix6
75+
%mix13 = mul nsw i32 %mix12, 17
76+
%mix14 = add nsw i32 %mix13, %mix9
77+
%mix15 = xor i32 %mix14, %mix10
78+
store i32 %mix15, ptr %ptrb, align 4
79+
br label %latch
80+
81+
latch:
82+
%inc = add nuw nsw i32 %i, 1
83+
%cmp.limit = icmp ult i32 %n, 4
84+
%upper = select i1 %cmp.limit, i32 %n, i32 4
85+
%cmp2 = icmp ult i32 %inc, %upper
86+
br i1 %cmp2, label %loop, label %exit
87+
88+
exit:
89+
ret void
90+
}

0 commit comments

Comments
 (0)