Skip to content

Commit 04f0aa7

Browse files
committed
[AArch64] Improve codegen of vectorised early exit loops
Once PR #112138 lands we are able to start vectorising more loops that have uncountable early exits. The typical loop structure looks like this:

vector.body:
  ...
  %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat
  ...
  %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred)
  %iv.cmp = icmp eq i64 %index.next, 4
  %exit.cond = or i1 %or.reduc, %iv.cmp
  br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
  br i1 %or.reduc, label %found, label %notfound

found:
  ret i64 1

notfound:
  ret i64 0

The problem with this is that %or.reduc is kept live after the loop, and since this is a boolean it typically requires making a copy of the condition code register. For AArch64 this requires an additional cset instruction, which is quite expensive for a typical find loop that only contains 6 or 7 instructions.

This patch attempts to improve the codegen by sinking the reduction out of the loop to the location of its user. It's a lot cheaper to keep the predicate alive if the type is legal and has lots of registers for it. There is a potential downside in that a little more work is required after the loop, but I believe this is worth it since we are likely to spend most of our time in the loop.
1 parent 3e8e3c6 commit 04f0aa7

File tree

3 files changed

+41
-14
lines changed

3 files changed

+41
-14
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5290,18 +5290,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52905290
}
52915291
}
52925292

5293-
// Sink vscales closer to uses for better isel
5293+
auto ShouldSinkCondition = [](Value *Cond) -> bool {
5294+
auto *II = dyn_cast<IntrinsicInst>(Cond);
5295+
return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5296+
isa<ScalableVectorType>(II->getOperand(0)->getType());
5297+
};
5298+
52945299
switch (I->getOpcode()) {
52955300
case Instruction::GetElementPtr:
52965301
case Instruction::Add:
52975302
case Instruction::Sub:
5303+
// Sink vscales closer to uses for better isel
52985304
for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
52995305
if (shouldSinkVScale(I->getOperand(Op), Ops)) {
53005306
Ops.push_back(&I->getOperandUse(Op));
53015307
return true;
53025308
}
53035309
}
53045310
break;
5311+
case Instruction::Select: {
5312+
if (!ShouldSinkCondition(I->getOperand(0)))
5313+
return false;
5314+
5315+
Ops.push_back(&I->getOperandUse(0));
5316+
return true;
5317+
}
5318+
case Instruction::Br: {
5319+
if (cast<BranchInst>(I)->isUnconditional())
5320+
return false;
5321+
5322+
if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5323+
return false;
5324+
5325+
Ops.push_back(&I->getOperandUse(0));
5326+
return true;
5327+
}
53055328
default:
53065329
break;
53075330
}

llvm/test/CodeGen/AArch64/reduce-or-opt.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -93,24 +93,24 @@ notfound:
9393
define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
9494
; CHECK-LABEL: select_or_reduce_nxv2i1:
9595
; CHECK: // %bb.0: // %entry
96-
; CHECK-NEXT: cntd x9
96+
; CHECK-NEXT: cntd x8
9797
; CHECK-NEXT: ptrue p0.d
98-
; CHECK-NEXT: mov x10, xzr
99-
; CHECK-NEXT: neg x8, x9
100-
; CHECK-NEXT: add x11, x8, #4
98+
; CHECK-NEXT: mov x9, xzr
99+
; CHECK-NEXT: neg x10, x8
100+
; CHECK-NEXT: add x10, x10, #4
101101
; CHECK-NEXT: .LBB2_1: // %vector.body
102102
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
103-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
103+
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
104104
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
105-
; CHECK-NEXT: cset w8, ne
106105
; CHECK-NEXT: b.ne .LBB2_3
107106
; CHECK-NEXT: // %bb.2: // %vector.body
108107
; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
109-
; CHECK-NEXT: cmp x11, x10
110-
; CHECK-NEXT: add x10, x10, x9
108+
; CHECK-NEXT: cmp x10, x9
109+
; CHECK-NEXT: add x9, x9, x8
111110
; CHECK-NEXT: b.ne .LBB2_1
112111
; CHECK-NEXT: .LBB2_3: // %middle.split
113-
; CHECK-NEXT: mov x0, x8
112+
; CHECK-NEXT: ptest p0, p1.b
113+
; CHECK-NEXT: cset w0, ne
114114
; CHECK-NEXT: ret
115115
entry:
116116
%vscale = tail call i64 @llvm.vscale.i64()
@@ -145,15 +145,15 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
145145
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
146146
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
147147
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
148-
; CHECK-NEXT: cset w11, ne
149148
; CHECK-NEXT: b.ne .LBB3_3
150149
; CHECK-NEXT: // %bb.2: // %vector.body
151150
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
152151
; CHECK-NEXT: cmp x10, x9
153152
; CHECK-NEXT: add x9, x9, x8
154153
; CHECK-NEXT: b.ne .LBB3_1
155154
; CHECK-NEXT: .LBB3_3: // %middle.split
156-
; CHECK-NEXT: tbz w11, #0, .LBB3_5
155+
; CHECK-NEXT: ptest p0, p1.b
156+
; CHECK-NEXT: b.eq .LBB3_5
157157
; CHECK-NEXT: // %bb.4: // %found
158158
; CHECK-NEXT: mov w8, #56 // =0x38
159159
; CHECK-NEXT: mov w0, #1 // =0x1

llvm/test/Transforms/CodeGenPrepare/AArch64/reduce-or-opt.ll

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
105105
; CHECK-NEXT: [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
106106
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
107107
; CHECK: [[MIDDLE_SPLIT]]:
108-
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[OR_REDUC]], i64 1, i64 0
108+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
109+
; CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
110+
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[TMP3]], i64 1, i64 0
109111
; CHECK-NEXT: ret i64 [[SEL]]
110112
;
111113
entry:
@@ -147,7 +149,9 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
147149
; CHECK-NEXT: [[EXIT_COND:%.*]] = or i1 [[OR_REDUC]], [[IV_CMP]]
148150
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]]
149151
; CHECK: [[MIDDLE_SPLIT]]:
150-
; CHECK-NEXT: br i1 [[OR_REDUC]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
152+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_LOAD]], zeroinitializer
153+
; CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP2]])
154+
; CHECK-NEXT: br i1 [[TMP3]], label %[[FOUND:.*]], label %[[NOTFOUND:.*]]
151155
; CHECK: [[FOUND]]:
152156
; CHECK-NEXT: store i64 56, ptr [[P]], align 8
153157
; CHECK-NEXT: ret i64 1

0 commit comments

Comments
 (0)