Commit b7b0071

Dinar Temirbulatov authored
[AArch64][SVE] Improve code quality of vector unsigned/signed add reductions. (#97339)
For SVE we don't have to zero-extend and sum part of the result before issuing the UADDV instruction. This change also handles types wider than a legal vector type more efficiently, and lowers a fixed-length vector type to SVE's UADDV where appropriate.
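
As an illustration, the first test added to llvm/test/CodeGen/AArch64/sve-int-reduce.ll below shows the pattern this combine targets; the whole extend-then-reduce sequence now lowers to a single predicated UADDV:

define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
  %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
  ret i32 %2
}

declare i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32>)

; Generated code, per the CHECK lines in the test diff below:
;   ptrue p0.b
;   uaddv d0, p0, z0.b
;   fmov  x0, d0
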
1 parent 6235698 commit b7b0071

7 files changed (+330, -78 lines)

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 70 additions & 2 deletions
@@ -17523,6 +17523,71 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
   return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
 }
 
+// Turn [sign|zero]_extend(vecreduce_add()) into SVE's SADDV|UADDV
+// instructions.
+static SDValue
+performVecReduceAddExtCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const AArch64TargetLowering &TLI) {
+  if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND)
+    return SDValue();
+  bool IsSigned = N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND;
+
+  SelectionDAG &DAG = DCI.DAG;
+  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  SDValue VecOp = N->getOperand(0).getOperand(0);
+  EVT VecOpVT = VecOp.getValueType();
+  SDLoc DL(N);
+
+  // Split the input vectors if not legal, e.g.
+  // i32 (vecreduce_add (zext nxv32i8 %op to nxv32i32))
+  // ->
+  // i32 (add
+  //   (i32 vecreduce_add (zext nxv16i8 %op.lo to nxv16i32)),
+  //   (i32 vecreduce_add (zext nxv16i8 %op.hi to nxv16i32)))
+  if (TLI.getTypeAction(*DAG.getContext(), VecOpVT) ==
+      TargetLowering::TypeSplitVector) {
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(VecOp, DL);
+    unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    EVT HalfVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
+        *DAG.getContext());
+    Lo = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
+                     DAG.getNode(ExtOpc, DL, HalfVT, Lo));
+    Hi = DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0),
+                     DAG.getNode(ExtOpc, DL, HalfVT, Hi));
+    return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Lo, Hi);
+  }
+
+  if (!TLI.isTypeLegal(VecOpVT))
+    return SDValue();
+
+  if (VecOpVT.isFixedLengthVector() &&
+      !TLI.useSVEForFixedLengthVectorVT(VecOpVT, !Subtarget.isNeonAvailable()))
+    return SDValue();
+
+  // The input type is legal so map VECREDUCE_ADD to UADDV/SADDV, e.g.
+  // i32 (vecreduce_add (zext nxv16i8 %op to nxv16i32))
+  // ->
+  // i32 (UADDV nxv16i8:%op)
+  EVT ElemType = N->getValueType(0);
+  SDValue Pg = getPredicateForVector(DAG, DL, VecOpVT);
+  if (VecOpVT.isFixedLengthVector()) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecOpVT);
+    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  }
+  SDValue Res =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+                  DAG.getConstant(IsSigned ? Intrinsic::aarch64_sve_saddv
+                                           : Intrinsic::aarch64_sve_uaddv,
+                                  DL, MVT::i64),
+                  Pg, VecOp);
+  if (ElemType != MVT::i64)
+    Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType);
+
+  return Res;
+}
+
 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
@@ -25208,8 +25273,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI, Subtarget);
-  case ISD::VECREDUCE_ADD:
-    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+  case ISD::VECREDUCE_ADD: {
+    if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget))
+      return Val;
+    return performVecReduceAddExtCombine(N, DCI, *this);
+  }
   case AArch64ISD::UADDV:
     return performUADDVCombine(N, DAG);
   case AArch64ISD::SMULL:
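
A note on the split path in performVecReduceAddExtCombine: when the pre-extend vector type is wider than a legal type, the combine halves the input and reduces each half separately, so both halves still reach the single-instruction UADDV form. A minimal IR reproducer of that case (hypothetical function name, mirroring the <vscale x 32 x i16> tests added below):

; Hypothetical reproducer, not part of this commit.
define i32 @split_reduce(<vscale x 32 x i16> %a) {
  %ext = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
  %red = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %ext)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32>)

Per the sve-int-reduce.ll diff below, this kind of input now becomes one UADDV per register-sized part plus scalar adds, instead of unpacking every element to the wider type first.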

llvm/test/CodeGen/AArch64/double_reduct.ll

Lines changed: 4 additions & 5 deletions
@@ -145,11 +145,10 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
 define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: add_ext_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    add v0.8h, v0.8h, v3.8h
-; CHECK-NEXT:    uadalp v0.8h, v2.16b
-; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-NEXT:    uadalp v1.8h, v2.16b
+; CHECK-NEXT:    addv h0, v1.8h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %ae = zext <32 x i8> %a to <32 x i16>

llvm/test/CodeGen/AArch64/sve-doublereduct.ll

Lines changed: 15 additions & 26 deletions
@@ -103,17 +103,12 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: add_ext_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z2.h, z0.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z3.h, z1.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEXT:    add z1.h, z1.h, z3.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    uaddv d1, p0, z1.b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -126,21 +121,15 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: add_ext_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    uunpklo z4.h, z0.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z5.h, z2.b
-; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z1.h, z4.h, z3.h
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    add z1.h, z2.h, z5.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    uaddv d0, p0, z0.h
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d1, p0, z1.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    uaddv d2, p0, z2.b
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
   %ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
   %be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>

llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll

Lines changed: 27 additions & 17 deletions
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON
@@ -6,24 +7,31 @@
 
 define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
 ; SVE256-LABEL: test:
-; SVE256: ld1b { z0.h }, p0/z,
-; SVE256: ld1b { z1.h }, p0/z,
-; SVE256: sub z0.h, z0.h, z1.h
-; SVE256-NEXT: sunpklo z1.s, z0.h
-; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
-; SVE256-NEXT: sunpklo z0.s, z0.h
-; SVE256-NEXT: add z0.s, z1.s, z0.s
-; SVE256-NEXT: uaddv d0, p1, z0.s
+; SVE256:       // %bb.0: // %L.entry
+; SVE256-NEXT:    ptrue p0.h, vl16
+; SVE256-NEXT:    mov w9, wzr
+; SVE256-NEXT:    mov w10, wzr
+; SVE256-NEXT:    mov w8, wzr
+; SVE256-NEXT:    mov w11, #-16 // =0xfffffff0
+; SVE256-NEXT:    .p2align 5, , 16
+; SVE256-NEXT:  .LBB0_1: // %L1
+; SVE256-NEXT:    // =>This Inner Loop Header: Depth=1
+; SVE256-NEXT:    sxtw x12, w9
+; SVE256-NEXT:    sxtw x13, w10
+; SVE256-NEXT:    adds w11, w11, #1
+; SVE256-NEXT:    add w10, w10, w3
+; SVE256-NEXT:    ld1b { z0.h }, p0/z, [x0, x12]
+; SVE256-NEXT:    ld1b { z1.h }, p0/z, [x2, x13]
+; SVE256-NEXT:    add w9, w9, w1
+; SVE256-NEXT:    sub z0.h, z0.h, z1.h
+; SVE256-NEXT:    saddv d0, p0, z0.h
+; SVE256-NEXT:    fmov w12, s0
+; SVE256-NEXT:    add w8, w12, w8
+; SVE256-NEXT:    b.lo .LBB0_1
+; SVE256-NEXT:  // %bb.2: // %L2
+; SVE256-NEXT:    mov w0, w8
+; SVE256-NEXT:    ret
 
-; NEON-LABEL: test:
-; NEON: ldr q0, [x0, w9, sxtw]
-; NEON: ldr q1, [x2, w10, sxtw]
-; NEON: usubl2 v2.8h, v0.16b, v1.16b
-; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b
-; NEON: saddl2 v1.4s, v0.8h, v2.8h
-; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h
-; NEON-NEXT: add v0.4s, v0.4s, v1.4s
-; NEON-NEXT: addv s0, v0.4s
 
 L.entry:
   br label %L1
@@ -55,3 +63,5 @@ L2: ; preds = %L1
 }
 
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NEON: {{.*}}
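
The SVE256 run above also exercises the new fixed-length handling: with a known minimum SVE register size, a fixed-length extend-and-reduce takes the same SADDV/UADDV lowering. A minimal sketch of that pattern (hypothetical function name; assuming -aarch64-sve-vector-bits-min=256 as in the RUN lines):

; Hypothetical sketch of the extend-and-reduce inside the loop above.
define i32 @fixed_reduce(<16 x i16> %v) {
  %ext = sext <16 x i16> %v to <16 x i32>
  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ext)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

This mirrors the sub z0.h / saddv d0, p0, z0.h sequence in the updated SVE256 loop body.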

llvm/test/CodeGen/AArch64/sve-int-reduce.ll

Lines changed: 88 additions & 0 deletions
@@ -188,6 +188,94 @@ define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
   ret i64 %res
 }
 
+define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+  ret i32 %2
+}
+
+define i64 @uaddv_nxv16i16_nxv16i64(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 16 x i16> %a to <vscale x 16 x i64>
+  %2 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> %1)
+  ret i64 %2
+}
+
+define i32 @uaddv_nxv16i16_nxv16i32(<vscale x 32 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d3, p0, z3.h
+; CHECK-NEXT:    uaddv d2, p0, z2.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    add w9, w11, w10
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %1 = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv32i64(<vscale x 32 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @saddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: saddv_nxv16i8_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    saddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) {
+; CHECK-LABEL: uaddv_nxv32i16_nxv32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    uaddv d1, p0, z1.h
+; CHECK-NEXT:    uaddv d2, p0, z2.h
+; CHECK-NEXT:    uaddv d3, p0, z3.h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s3
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    add w9, w11, w10
+; CHECK-NEXT:    add w0, w9, w8
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 32 x i16>, ptr %a, align 16
+  %2 = zext <vscale x 32 x i16> %1 to <vscale x 32 x i32>
+  %3 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %2)
+  ret i32 %3
+}
+
 ; UMINV
 
 define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
