Skip to content

Commit 346185c

Browse files
authored
[AArch64] Improve codegen of vectorised early exit loops (#119534)
Once PR #112138 lands we are able to start vectorising more loops that have uncountable early exits. The typical loop structure looks like this: vector.body: ... %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat ... %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred) %iv.cmp = icmp eq i64 %index.next, 4 %exit.cond = or i1 %or.reduc, %iv.cmp br i1 %exit.cond, label %middle.split, label %vector.body middle.split: br i1 %or.reduc, label %found, label %notfound found: ret i64 1 notfound: ret i64 0 The problem with this is that %or.reduc is kept live after the loop, and since this is a boolean it typically requires making a copy of the condition code register. For AArch64 this requires an additional cset instruction, which is quite expensive for a typical find loop that only contains 6 or 7 instructions. This patch attempts to improve the codegen by sinking the reduction out of the loop to the location of its user. It's a lot cheaper to keep the predicate alive if the type is legal and has lots of registers for it. There is a potential downside in that a little more work is required after the loop, but I believe this is worth it since we are likely to spend most of our time in the loop.
1 parent 8f17c90 commit 346185c

File tree

3 files changed

+406
-1
lines changed

3 files changed

+406
-1
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5290,18 +5290,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52905290
}
52915291
}
52925292

5293-
// Sink vscales closer to uses for better isel
5293+
auto ShouldSinkCondition = [](Value *Cond) -> bool {
5294+
auto *II = dyn_cast<IntrinsicInst>(Cond);
5295+
return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5296+
isa<ScalableVectorType>(II->getOperand(0)->getType());
5297+
};
5298+
52945299
switch (I->getOpcode()) {
52955300
case Instruction::GetElementPtr:
52965301
case Instruction::Add:
52975302
case Instruction::Sub:
5303+
// Sink vscales closer to uses for better isel
52985304
for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
52995305
if (shouldSinkVScale(I->getOperand(Op), Ops)) {
53005306
Ops.push_back(&I->getOperandUse(Op));
53015307
return true;
53025308
}
53035309
}
53045310
break;
5311+
case Instruction::Select: {
5312+
if (!ShouldSinkCondition(I->getOperand(0)))
5313+
return false;
5314+
5315+
Ops.push_back(&I->getOperandUse(0));
5316+
return true;
5317+
}
5318+
case Instruction::Br: {
5319+
if (cast<BranchInst>(I)->isUnconditional())
5320+
return false;
5321+
5322+
if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5323+
return false;
5324+
5325+
Ops.push_back(&I->getOperandUse(0));
5326+
return true;
5327+
}
53055328
default:
53065329
break;
53075330
}
Lines changed: 193 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,193 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
3+
4+
define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
5+
; CHECK-LABEL: select_or_reduce_v2i1:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: mov x8, xzr
8+
; CHECK-NEXT: .LBB0_1: // %vector.body
9+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
10+
; CHECK-NEXT: ldr q0, [x0, x8]
11+
; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
12+
; CHECK-NEXT: umaxv s0, v0.4s
13+
; CHECK-NEXT: fmov w9, s0
14+
; CHECK-NEXT: tbnz w9, #0, .LBB0_3
15+
; CHECK-NEXT: // %bb.2: // %vector.body
16+
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
17+
; CHECK-NEXT: cmp x8, #16
18+
; CHECK-NEXT: add x8, x8, #16
19+
; CHECK-NEXT: b.ne .LBB0_1
20+
; CHECK-NEXT: .LBB0_3: // %middle.split
21+
; CHECK-NEXT: and x0, x9, #0x1
22+
; CHECK-NEXT: ret
23+
entry:
24+
br label %vector.body
25+
26+
vector.body:
27+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
28+
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
29+
%wide.load = load <2 x ptr>, ptr %arrayidx, align 8
30+
%cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
31+
%index.next = add nuw i64 %index, 2
32+
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
33+
%iv.cmp = icmp eq i64 %index.next, 4
34+
%exit.cond = or i1 %or.reduc, %iv.cmp
35+
br i1 %exit.cond, label %middle.split, label %vector.body
36+
37+
middle.split:
38+
%sel = select i1 %or.reduc, i64 1, i64 0
39+
ret i64 %sel
40+
}
41+
42+
define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
43+
; CHECK-LABEL: br_or_reduce_v2i1:
44+
; CHECK: // %bb.0: // %entry
45+
; CHECK-NEXT: mov x8, xzr
46+
; CHECK-NEXT: .LBB1_1: // %vector.body
47+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
48+
; CHECK-NEXT: ldr q0, [x0, x8]
49+
; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
50+
; CHECK-NEXT: umaxv s0, v0.4s
51+
; CHECK-NEXT: fmov w9, s0
52+
; CHECK-NEXT: tbnz w9, #0, .LBB1_3
53+
; CHECK-NEXT: // %bb.2: // %vector.body
54+
; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
55+
; CHECK-NEXT: cmp x8, #16
56+
; CHECK-NEXT: add x8, x8, #16
57+
; CHECK-NEXT: b.ne .LBB1_1
58+
; CHECK-NEXT: .LBB1_3: // %middle.split
59+
; CHECK-NEXT: tbz w9, #0, .LBB1_5
60+
; CHECK-NEXT: // %bb.4: // %found
61+
; CHECK-NEXT: mov w8, #56 // =0x38
62+
; CHECK-NEXT: mov w0, #1 // =0x1
63+
; CHECK-NEXT: str x8, [x1]
64+
; CHECK-NEXT: ret
65+
; CHECK-NEXT: .LBB1_5:
66+
; CHECK-NEXT: mov x0, xzr
67+
; CHECK-NEXT: ret
68+
entry:
69+
br label %vector.body
70+
71+
vector.body:
72+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
73+
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
74+
%wide.load = load <2 x ptr>, ptr %arrayidx, align 8
75+
%cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
76+
%index.next = add nuw i64 %index, 2
77+
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
78+
%iv.cmp = icmp eq i64 %index.next, 4
79+
%exit.cond = or i1 %or.reduc, %iv.cmp
80+
br i1 %exit.cond, label %middle.split, label %vector.body
81+
82+
middle.split:
83+
br i1 %or.reduc, label %found, label %notfound
84+
85+
found:
86+
store i64 56, ptr %p, align 8
87+
ret i64 1
88+
89+
notfound:
90+
ret i64 0
91+
}
92+
93+
define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
94+
; CHECK-LABEL: select_or_reduce_nxv2i1:
95+
; CHECK: // %bb.0: // %entry
96+
; CHECK-NEXT: cntd x8
97+
; CHECK-NEXT: ptrue p0.d
98+
; CHECK-NEXT: mov x9, xzr
99+
; CHECK-NEXT: neg x10, x8
100+
; CHECK-NEXT: add x10, x10, #4
101+
; CHECK-NEXT: .LBB2_1: // %vector.body
102+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
103+
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
104+
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
105+
; CHECK-NEXT: b.ne .LBB2_3
106+
; CHECK-NEXT: // %bb.2: // %vector.body
107+
; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
108+
; CHECK-NEXT: cmp x10, x9
109+
; CHECK-NEXT: add x9, x9, x8
110+
; CHECK-NEXT: b.ne .LBB2_1
111+
; CHECK-NEXT: .LBB2_3: // %middle.split
112+
; CHECK-NEXT: ptest p0, p1.b
113+
; CHECK-NEXT: cset w0, ne
114+
; CHECK-NEXT: ret
115+
entry:
116+
%vscale = tail call i64 @llvm.vscale.i64()
117+
%vf = shl nuw nsw i64 %vscale, 1
118+
br label %vector.body
119+
120+
vector.body:
121+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
122+
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
123+
%wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
124+
%cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
125+
%index.next = add nuw i64 %index, %vf
126+
%or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
127+
%iv.cmp = icmp eq i64 %index.next, 4
128+
%exit.cond = or i1 %or.reduc, %iv.cmp
129+
br i1 %exit.cond, label %middle.split, label %vector.body
130+
131+
middle.split:
132+
%sel = select i1 %or.reduc, i64 1, i64 0
133+
ret i64 %sel
134+
}
135+
136+
define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
137+
; CHECK-LABEL: br_or_reduce_nxv2i1:
138+
; CHECK: // %bb.0: // %entry
139+
; CHECK-NEXT: cntd x8
140+
; CHECK-NEXT: ptrue p0.d
141+
; CHECK-NEXT: mov x9, xzr
142+
; CHECK-NEXT: neg x10, x8
143+
; CHECK-NEXT: add x10, x10, #4
144+
; CHECK-NEXT: .LBB3_1: // %vector.body
145+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
146+
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
147+
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
148+
; CHECK-NEXT: b.ne .LBB3_3
149+
; CHECK-NEXT: // %bb.2: // %vector.body
150+
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
151+
; CHECK-NEXT: cmp x10, x9
152+
; CHECK-NEXT: add x9, x9, x8
153+
; CHECK-NEXT: b.ne .LBB3_1
154+
; CHECK-NEXT: .LBB3_3: // %middle.split
155+
; CHECK-NEXT: ptest p0, p1.b
156+
; CHECK-NEXT: b.eq .LBB3_5
157+
; CHECK-NEXT: // %bb.4: // %found
158+
; CHECK-NEXT: mov w8, #56 // =0x38
159+
; CHECK-NEXT: mov w0, #1 // =0x1
160+
; CHECK-NEXT: str x8, [x1]
161+
; CHECK-NEXT: ret
162+
; CHECK-NEXT: .LBB3_5:
163+
; CHECK-NEXT: mov x0, xzr
164+
; CHECK-NEXT: ret
165+
entry:
166+
%vscale = tail call i64 @llvm.vscale.i64()
167+
%vf = shl nuw nsw i64 %vscale, 1
168+
br label %vector.body
169+
170+
vector.body:
171+
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
172+
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
173+
%wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
174+
%cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
175+
%index.next = add nuw i64 %index, %vf
176+
%or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
177+
%iv.cmp = icmp eq i64 %index.next, 4
178+
%exit.cond = or i1 %or.reduc, %iv.cmp
179+
br i1 %exit.cond, label %middle.split, label %vector.body
180+
181+
middle.split:
182+
br i1 %or.reduc, label %found, label %notfound
183+
184+
found:
185+
store i64 56, ptr %p, align 8
186+
ret i64 1
187+
188+
notfound:
189+
ret i64 0
190+
}
191+
192+
declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
193+
declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)

0 commit comments

Comments
 (0)