[AArch64] Improve codegen of vectorised early exit loops #119534

Merged (2 commits, Jan 6, 2025)

25 changes: 24 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5290,18 +5290,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  // Sink vscales closer to uses for better isel
+  auto ShouldSinkCondition = [](Value *Cond) -> bool {
+    auto *II = dyn_cast<IntrinsicInst>(Cond);
+    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+           isa<ScalableVectorType>(II->getOperand(0)->getType());
+  };
+
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
   case Instruction::Add:
   case Instruction::Sub:
+    // Sink vscales closer to uses for better isel
     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
         Ops.push_back(&I->getOperandUse(Op));
         return true;
       }
     }
     break;
+  case Instruction::Select: {
+    if (!ShouldSinkCondition(I->getOperand(0)))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
+  case Instruction::Br: {
+    if (cast<BranchInst>(I)->isUnconditional())
+      return false;
+
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
   default:
     break;
   }
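
isProfitableToSinkOperands is the hook CodeGenPrepare queries before sinking an instruction's operands into the block that uses them. With this change, when a select or a conditional branch consumes an llvm.vector.reduce.or whose operand is a scalable vector, the use is reported as profitable to sink, so CodeGenPrepare rematerialises the reduction next to its consumer. Because SelectionDAG selects one basic block at a time, this lets instruction selection fold the predicate reduction and the branch into a flag-setting ptest instead of moving the i1 through a general-purpose register. A rough before/after sketch of the effect on the IR (value names are illustrative, not taken from the patch):

  ; before CodeGenPrepare: the reduction and the branch on it live in
  ; different blocks, so ISel cannot combine them
  vector.body:
    %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
    ...
  middle.split:
    br i1 %or.reduc, label %found, label %notfound

  ; after sinking: middle.split has its own copy of the reduction, which
  ; ISel folds into ptest + b.ne/b.eq
  middle.split:
    %or.reduc.sunk = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
    br i1 %or.reduc.sunk, label %found, label %notfound
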
193 changes: 193 additions & 0 deletions llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,193 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s

define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: select_or_reduce_v2i1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: tbnz w9, #0, .LBB0_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmp x8, #16
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: .LBB0_3: // %middle.split
; CHECK-NEXT: and x0, x9, #0x1
; CHECK-NEXT: ret
entry:
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
%wide.load = load <2 x ptr>, ptr %arrayidx, align 8
%cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
%index.next = add nuw i64 %index, 2
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
%sel = select i1 %or.reduc, i64 1, i64 0
ret i64 %sel
}

define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
; CHECK-LABEL: br_or_reduce_v2i1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: tbnz w9, #0, .LBB1_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: cmp x8, #16
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.ne .LBB1_1
; CHECK-NEXT: .LBB1_3: // %middle.split
; CHECK-NEXT: tbz w9, #0, .LBB1_5
; CHECK-NEXT: // %bb.4: // %found
; CHECK-NEXT: mov w8, #56 // =0x38
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_5:
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
entry:
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
%wide.load = load <2 x ptr>, ptr %arrayidx, align 8
%cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
%index.next = add nuw i64 %index, 2
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
br i1 %or.reduc, label %found, label %notfound

found:
store i64 56, ptr %p, align 8
ret i64 1

notfound:
ret i64 0
}
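
; NOTE: The two fixed-width tests above are unaffected by this change:
; <2 x i1> is not a scalable vector type, so ShouldSinkCondition rejects the
; reduction and the i1 result is still moved through a GPR (umaxv + fmov +
; tbnz). The scalable-vector tests below exercise the new sinking: the
; reduction is rematerialised in %middle.split, where it is selected as a
; flag-setting ptest.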

define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: select_or_reduce_nxv2i1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: neg x10, x8
; CHECK-NEXT: add x10, x10, #4
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: b.ne .LBB2_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: cmp x10, x9
; CHECK-NEXT: add x9, x9, x8
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: .LBB2_3: // %middle.split
; CHECK-NEXT: ptest p0, p1.b
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
entry:
%vscale = tail call i64 @llvm.vscale.i64()
%vf = shl nuw nsw i64 %vscale, 1
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
%wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
%cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
%index.next = add nuw i64 %index, %vf
%or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
%sel = select i1 %or.reduc, i64 1, i64 0
ret i64 %sel
}

define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
; CHECK-LABEL: br_or_reduce_nxv2i1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: neg x10, x8
; CHECK-NEXT: add x10, x10, #4
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: b.ne .LBB3_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: cmp x10, x9
; CHECK-NEXT: add x9, x9, x8
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: .LBB3_3: // %middle.split
; CHECK-NEXT: ptest p0, p1.b
; CHECK-NEXT: b.eq .LBB3_5
; CHECK-NEXT: // %bb.4: // %found
; CHECK-NEXT: mov w8, #56 // =0x38
; CHECK-NEXT: mov w0, #1 // =0x1
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB3_5:
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
entry:
%vscale = tail call i64 @llvm.vscale.i64()
%vf = shl nuw nsw i64 %vscale, 1
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
%wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
%cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
%index.next = add nuw i64 %index, %vf
%or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
br i1 %or.reduc, label %found, label %notfound

found:
store i64 56, ptr %p, align 8
ret i64 1

notfound:
ret i64 0
}

declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
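
The Select case benefits in the same way; a minimal sketch of %middle.split after sinking, reusing the illustrative names from the earlier sketch:

  middle.split:
    %or.reduc.sunk = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
    %sel = select i1 %or.reduc.sunk, i64 1, i64 0
    ret i64 %sel

This is why select_or_reduce_nxv2i1 checks for ptest followed directly by cset w0, ne, with no intermediate materialisation of the i1 in a GPR.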