Skip to content

Commit af03d6b

Browse files
authored
[AArch64][SVE] Refactor getPTrue to return splat(1) when pattern=all. (#139236)
Similarly to #135016, refactor getPTrue to return splat(1) for all-active patterns. The main motivation for this is to improve code generation for fixed-length vector loads/stores that are converted to SVE masked memory ops when the vectors are wider than Neon. Emitting the mask as a splat helps DAGCombiner simplify all-active masked loads/stores into unmasked ones, for which it already has suitable combines and ISel has suitable patterns.
1 parent d27d0c7 commit af03d6b

21 files changed

+203
-266
lines changed

clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ void func(int *restrict a, int *restrict b) {
1616
// CHECK256-COUNT-8: str
1717
// CHECK512-COUNT-4: str
1818
// CHECK1024-COUNT-2: str
19-
// CHECK2048-COUNT-1: st1w
19+
// CHECK2048-COUNT-1: str
2020
#pragma clang loop vectorize(enable)
2121
for (int i = 0; i < 64; ++i)
2222
a[i] += b[i];

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5725,8 +5725,8 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
57255725

57265726
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
57275727
int Pattern) {
5728-
if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5729-
return DAG.getConstant(1, DL, MVT::nxv1i1);
5728+
if (Pattern == AArch64SVEPredPattern::all)
5729+
return DAG.getConstant(1, DL, VT);
57305730
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
57315731
DAG.getTargetConstant(Pattern, DL, MVT::i32));
57325732
}
@@ -25030,7 +25030,7 @@ static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
2503025030
if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
2503125031
AnyPred = AnyPred.getOperand(0);
2503225032

25033-
if (TruePred != AnyPred && TruePred.getOpcode() != AArch64ISD::PTRUE)
25033+
if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
2503425034
return SDValue();
2503525035

2503625036
SDValue LastB = Op->getOperand(0);
@@ -28568,7 +28568,7 @@ static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
2856828568
}
2856928569
}
2857028570

28571-
// Return a PTRUE with active lanes corresponding to the extent of VT.
28571+
// Return a predicate with active lanes corresponding to the extent of VT.
2857228572
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
2857328573
EVT VT) {
2857428574
assert(VT.isFixedLengthVector() &&

llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -208,13 +208,8 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
208208
; CHECK: // %bb.0:
209209
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210210
; CHECK-NEXT: addvl sp, sp, #-1
211-
; CHECK-NEXT: ptrue p0.d
212-
; CHECK-NEXT: ptrue p1.d, vl8
213-
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
214-
; CHECK-NEXT: str z0, [sp]
215-
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
216-
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
217-
; CHECK-NEXT: ldr z0, [sp]
211+
; CHECK-NEXT: ptrue p0.d, vl8
212+
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
218213
; CHECK-NEXT: addvl sp, sp, #1
219214
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
220215
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,8 +331,7 @@ define void @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %p) nounwi
331331
; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
332332
; CHECK: // %bb.0:
333333
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
334-
; CHECK-NEXT: ptrue p0.d
335-
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
334+
; CHECK-NEXT: str z0, [x0]
336335
; CHECK-NEXT: ret
337336
%retval = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
338337
store <4 x i64> %retval, ptr %p

llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ define void @st1d_fixed(ptr %ptr) #0 {
1818
; CHECK-NEXT: ptrue p0.d
1919
; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x20]
2020
; CHECK-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
21-
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
21+
; CHECK-NEXT: str z0, [x19]
2222
; CHECK-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload
2323
; CHECK-NEXT: add sp, sp, #160
2424
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,11 +544,10 @@ define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
544544
define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
545545
; CHECK-LABEL: extract_subvector_legalization_v8i32:
546546
; CHECK: // %bb.0: // %entry
547-
; CHECK-NEXT: ptrue p0.s
548547
; CHECK-NEXT: adrp x8, .LCPI40_0
549548
; CHECK-NEXT: add x8, x8, :lo12:.LCPI40_0
550549
; CHECK-NEXT: ptrue p1.d
551-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
550+
; CHECK-NEXT: ldr z0, [x8]
552551
; CHECK-NEXT: mov z1.d, z0.d
553552
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
554553
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-convert.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ target triple = "aarch64-unknown-linux-gnu"
77
define void @fp_convert_combine_crash(ptr %a, ptr %b) #0 {
88
; CHECK-LABEL: fp_convert_combine_crash:
99
; CHECK: // %bb.0:
10+
; CHECK-NEXT: fmov z0.s, #8.00000000
11+
; CHECK-NEXT: ldr z1, [x0]
1012
; CHECK-NEXT: ptrue p0.s
11-
; CHECK-NEXT: fmov z1.s, #8.00000000
12-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
13-
; CHECK-NEXT: fmul z0.s, z0.s, z1.s
13+
; CHECK-NEXT: fmul z0.s, z1.s, z0.s
1414
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
15-
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
15+
; CHECK-NEXT: str z0, [x1]
1616
; CHECK-NEXT: ret
1717
%f = load <8 x float>, ptr %a
1818
%mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,

llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,32 +20,31 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v
2020
; CHECK-NEXT: .cfi_offset w21, -24
2121
; CHECK-NEXT: .cfi_offset w22, -32
2222
; CHECK-NEXT: .cfi_offset w29, -48
23-
; CHECK-NEXT: ptrue p0.d
2423
; CHECK-NEXT: add x10, sp, #176
2524
; CHECK-NEXT: add x8, sp, #48
2625
; CHECK-NEXT: add x9, sp, #144
27-
; CHECK-NEXT: add x20, sp, #176
28-
; CHECK-NEXT: ldr x15, [sp, #104]
29-
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10]
30-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
26+
; CHECK-NEXT: ldr z3, [x10]
27+
; CHECK-NEXT: ldr z0, [x8]
3128
; CHECK-NEXT: add x8, sp, #112
32-
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9]
33-
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8]
34-
; CHECK-NEXT: ldur q4, [sp, #88]
29+
; CHECK-NEXT: ldr z2, [x9]
30+
; CHECK-NEXT: ldr z1, [x8]
31+
; CHECK-NEXT: add x20, sp, #176
3532
; CHECK-NEXT: ldp x9, x8, [sp, #328]
36-
; CHECK-NEXT: ldr x19, [sp, #272]
33+
; CHECK-NEXT: ldr x15, [sp, #104]
3734
; CHECK-NEXT: ldp x11, x10, [sp, #312]
35+
; CHECK-NEXT: ldur q4, [sp, #88]
3836
; CHECK-NEXT: ldp x13, x12, [sp, #296]
37+
; CHECK-NEXT: ldr x19, [sp, #272]
3938
; CHECK-NEXT: ldp x18, x14, [sp, #280]
4039
; CHECK-NEXT: ldp x16, x17, [sp, #208]
4140
; CHECK-NEXT: ldp x21, x22, [sp, #352]
42-
; CHECK-NEXT: st1d { z3.d }, p0, [x20]
41+
; CHECK-NEXT: str z3, [x20]
4342
; CHECK-NEXT: add x20, sp, #144
44-
; CHECK-NEXT: st1d { z2.d }, p0, [x20]
43+
; CHECK-NEXT: str z2, [x20]
4544
; CHECK-NEXT: add x20, sp, #112
46-
; CHECK-NEXT: st1d { z1.d }, p0, [x20]
45+
; CHECK-NEXT: str z1, [x20]
4746
; CHECK-NEXT: add x20, sp, #48
48-
; CHECK-NEXT: st1d { z0.d }, p0, [x20]
47+
; CHECK-NEXT: str z0, [x20]
4948
; CHECK-NEXT: stp x21, x22, [sp, #352]
5049
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
5150
; CHECK-NEXT: stp x19, x18, [sp, #272]

llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
22
; RUN: llc -debug-only=isel < %s 2>&1 | FileCheck %s
33

44
; REQUIRES: asserts
@@ -9,16 +9,15 @@ target triple = "aarch64-unknown-linux-gnu"
99
; accessing fixed width objects.
1010
define void @foo(ptr %a) #0 {
1111
; CHECK-LABEL: foo:
12-
; CHECK: SelectionDAG has 15 nodes:
12+
; CHECK: SelectionDAG has 13 nodes:
1313
; CHECK-NEXT: t0: ch,glue = EntryToken
14-
; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31>
1514
; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
16-
; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM<Mem:(volatile load (s512) from %ir.a)> t12, t2, TargetConstant:i64<0>, t0
15+
; CHECK-NEXT: t21: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
1716
; CHECK-NEXT: t8: i64 = ADDXri TargetFrameIndex:i64<1>, TargetConstant:i32<0>, TargetConstant:i32<0>
1817
; CHECK-NEXT: t6: i64 = ADDXri TargetFrameIndex:i64<0>, TargetConstant:i32<0>, TargetConstant:i32<0>
19-
; CHECK-NEXT: t17: ch = ST1D_IMM<Mem:(volatile store (s512) into %ir.r0)> t18, t12, t6, TargetConstant:i64<0>, t18:1
20-
; CHECK-NEXT: t16: ch = ST1D_IMM<Mem:(volatile store (s512) into %ir.r1)> t18, t12, t8, TargetConstant:i64<0>, t17
21-
; CHECK-NEXT: t10: ch = RET_ReallyLR t16
18+
; CHECK-NEXT: t22: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t21, t6, TargetConstant:i64<0>, t21:1
19+
; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t21, t8, TargetConstant:i64<0>, t22
20+
; CHECK-NEXT: t10: ch = RET_ReallyLR t23
2221
; CHECK-EMPTY:
2322
entry:
2423
%r0 = alloca <8 x i64>

llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -380,11 +380,10 @@ define void @v8i32(ptr %ldptr, ptr %stptr) {
380380
;
381381
; CHECK-256-LABEL: v8i32:
382382
; CHECK-256: // %bb.0:
383-
; CHECK-256-NEXT: ptrue p0.s
384-
; CHECK-256-NEXT: ld1w { z0.s }, p0/z, [x0, #2, mul vl]
385-
; CHECK-256-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
386-
; CHECK-256-NEXT: st1w { z0.s }, p0, [x1, #2, mul vl]
387-
; CHECK-256-NEXT: st1w { z1.s }, p0, [x1, #1, mul vl]
383+
; CHECK-256-NEXT: ldr z0, [x0, #2, mul vl]
384+
; CHECK-256-NEXT: ldr z1, [x0, #1, mul vl]
385+
; CHECK-256-NEXT: str z0, [x1, #2, mul vl]
386+
; CHECK-256-NEXT: str z1, [x1, #1, mul vl]
388387
; CHECK-256-NEXT: ret
389388
;
390389
; CHECK-512-LABEL: v8i32:
@@ -437,8 +436,7 @@ define void @v8i32_vscale(ptr %0) {
437436
; CHECK-256-LABEL: v8i32_vscale:
438437
; CHECK-256: // %bb.0:
439438
; CHECK-256-NEXT: mov z0.s, #1 // =0x1
440-
; CHECK-256-NEXT: ptrue p0.s
441-
; CHECK-256-NEXT: st1w { z0.s }, p0, [x0, #2, mul vl]
439+
; CHECK-256-NEXT: str z0, [x0, #2, mul vl]
442440
; CHECK-256-NEXT: ret
443441
;
444442
; CHECK-512-LABEL: v8i32_vscale:

llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@ target triple = "aarch64-unknown-linux-gnu"
66
define void @add_v64i8(ptr %a, ptr %b) #0 {
77
; CHECK-LABEL: add_v64i8:
88
; CHECK: // %bb.0:
9-
; CHECK-NEXT: ptrue p0.b
10-
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
11-
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
9+
; CHECK-NEXT: ldr z0, [x0]
10+
; CHECK-NEXT: ldr z1, [x1]
1211
; CHECK-NEXT: add z0.b, z0.b, z1.b
13-
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
12+
; CHECK-NEXT: str z0, [x0]
1413
; CHECK-NEXT: ret
1514
%op1 = load <64 x i8>, ptr %a
1615
%op2 = load <64 x i8>, ptr %b
@@ -22,11 +21,10 @@ define void @add_v64i8(ptr %a, ptr %b) #0 {
2221
define void @add_v32i16(ptr %a, ptr %b, ptr %c) #0 {
2322
; CHECK-LABEL: add_v32i16:
2423
; CHECK: // %bb.0:
25-
; CHECK-NEXT: ptrue p0.h
26-
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
27-
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
24+
; CHECK-NEXT: ldr z0, [x0]
25+
; CHECK-NEXT: ldr z1, [x1]
2826
; CHECK-NEXT: add z0.h, z0.h, z1.h
29-
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
27+
; CHECK-NEXT: str z0, [x0]
3028
; CHECK-NEXT: ret
3129
%op1 = load <32 x i16>, ptr %a
3230
%op2 = load <32 x i16>, ptr %b
@@ -38,10 +36,10 @@ define void @add_v32i16(ptr %a, ptr %b, ptr %c) #0 {
3836
define void @abs_v16i32(ptr %a) #0 {
3937
; CHECK-LABEL: abs_v16i32:
4038
; CHECK: // %bb.0:
39+
; CHECK-NEXT: ldr z0, [x0]
4140
; CHECK-NEXT: ptrue p0.s
42-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
4341
; CHECK-NEXT: abs z0.s, p0/m, z0.s
44-
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
42+
; CHECK-NEXT: str z0, [x0]
4543
; CHECK-NEXT: ret
4644
%op1 = load <16 x i32>, ptr %a
4745
%res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
@@ -52,10 +50,10 @@ define void @abs_v16i32(ptr %a) #0 {
5250
define void @abs_v8i64(ptr %a) #0 {
5351
; CHECK-LABEL: abs_v8i64:
5452
; CHECK: // %bb.0:
53+
; CHECK-NEXT: ldr z0, [x0]
5554
; CHECK-NEXT: ptrue p0.d
56-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
5755
; CHECK-NEXT: abs z0.d, p0/m, z0.d
58-
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
56+
; CHECK-NEXT: str z0, [x0]
5957
; CHECK-NEXT: ret
6058
%op1 = load <8 x i64>, ptr %a
6159
%res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
@@ -66,11 +64,10 @@ define void @abs_v8i64(ptr %a) #0 {
6664
define void @fadd_v32f16(ptr %a, ptr %b) #0 {
6765
; CHECK-LABEL: fadd_v32f16:
6866
; CHECK: // %bb.0:
69-
; CHECK-NEXT: ptrue p0.h
70-
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
71-
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
67+
; CHECK-NEXT: ldr z0, [x0]
68+
; CHECK-NEXT: ldr z1, [x1]
7269
; CHECK-NEXT: fadd z0.h, z0.h, z1.h
73-
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
70+
; CHECK-NEXT: str z0, [x0]
7471
; CHECK-NEXT: ret
7572
%op1 = load <32 x half>, ptr %a
7673
%op2 = load <32 x half>, ptr %b
@@ -82,11 +79,10 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 {
8279
define void @fadd_v16f32(ptr %a, ptr %b) #0 {
8380
; CHECK-LABEL: fadd_v16f32:
8481
; CHECK: // %bb.0:
85-
; CHECK-NEXT: ptrue p0.s
86-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
87-
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
82+
; CHECK-NEXT: ldr z0, [x0]
83+
; CHECK-NEXT: ldr z1, [x1]
8884
; CHECK-NEXT: fadd z0.s, z0.s, z1.s
89-
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
85+
; CHECK-NEXT: str z0, [x0]
9086
; CHECK-NEXT: ret
9187
%op1 = load <16 x float>, ptr %a
9288
%op2 = load <16 x float>, ptr %b
@@ -98,11 +94,10 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 {
9894
define void @fadd_v8f64(ptr %a, ptr %b) #0 {
9995
; CHECK-LABEL: fadd_v8f64:
10096
; CHECK: // %bb.0:
101-
; CHECK-NEXT: ptrue p0.d
102-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
103-
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
97+
; CHECK-NEXT: ldr z0, [x0]
98+
; CHECK-NEXT: ldr z1, [x1]
10499
; CHECK-NEXT: fadd z0.d, z0.d, z1.d
105-
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
100+
; CHECK-NEXT: str z0, [x0]
106101
; CHECK-NEXT: ret
107102
%op1 = load <8 x double>, ptr %a
108103
%op2 = load <8 x double>, ptr %b

0 commit comments

Comments
 (0)