Skip to content

Commit 6ac7c16

Browse files
committed
[AArch64][SVE] Refactor getPTrue to return splat(1) when pattern=all.
Similarly to llvm#135016, refactor getPTrue to return splat(1) for all-active patterns. The main motivation for this is to improve code gen for fixed-length vector loads/stores that are lowered to SVE masked memory ops when they are wider than Neon. Emitting the mask as a splat helps DAGCombiner simplify all-active masked loads/stores into unmasked ones, for which we already have suitable patterns. There are four places in AArch64ISelLowering that match against AArch64ISD::PTRUE opcodes explicitly. Of these, only one (foldCSELofLASTB) led to test regressions, which I addressed by adding a check for ISD::isConstantSplatVectorAllOnes (I'm not sure if the intent here is to genuinely match any PTRUE node, or if isAllActivePredicate should be used instead). The other three combines (performUnpackCombine, performMSTORECombine and performSetCCPunpkCombine) check for patterns in the range [VL1, VL256], so those should already skip all-active masks. Given the recent changes, going this route seemed more sensible than replicating the combines from DAGCombiner or adding patterns for all-active masked loads/stores, but I'm happy pursuing either of these approaches (or any other) if they are seen as more appropriate.
1 parent 9c88b6d commit 6ac7c16

21 files changed

+204
-266
lines changed

clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ void func(int *restrict a, int *restrict b) {
1616
// CHECK256-COUNT-8: str
1717
// CHECK512-COUNT-4: str
1818
// CHECK1024-COUNT-2: str
19-
// CHECK2048-COUNT-1: st1w
19+
// CHECK2048-COUNT-1: str
2020
#pragma clang loop vectorize(enable)
2121
for (int i = 0; i < 64; ++i)
2222
a[i] += b[i];

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5725,8 +5725,8 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
57255725

57265726
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
57275727
int Pattern) {
5728-
if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5729-
return DAG.getConstant(1, DL, MVT::nxv1i1);
5728+
if (Pattern == AArch64SVEPredPattern::all)
5729+
return DAG.getConstant(1, DL, VT);
57305730
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
57315731
DAG.getTargetConstant(Pattern, DL, MVT::i32));
57325732
}
@@ -25030,7 +25030,8 @@ static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
2503025030
if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
2503125031
AnyPred = AnyPred.getOperand(0);
2503225032

25033-
if (TruePred != AnyPred && TruePred.getOpcode() != AArch64ISD::PTRUE)
25033+
if (TruePred != AnyPred && TruePred.getOpcode() != AArch64ISD::PTRUE &&
25034+
!ISD::isConstantSplatVectorAllOnes(TruePred.getNode()))
2503425035
return SDValue();
2503525036

2503625037
SDValue LastB = Op->getOperand(0);
@@ -28568,7 +28569,7 @@ static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
2856828569
}
2856928570
}
2857028571

28571-
// Return a PTRUE with active lanes corresponding to the extent of VT.
28572+
// Return a predicate with active lanes corresponding to the extent of VT.
2857228573
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
2857328574
EVT VT) {
2857428575
assert(VT.isFixedLengthVector() &&

llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -208,13 +208,8 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
208208
; CHECK: // %bb.0:
209209
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
210210
; CHECK-NEXT: addvl sp, sp, #-1
211-
; CHECK-NEXT: ptrue p0.d
212-
; CHECK-NEXT: ptrue p1.d, vl8
213-
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
214-
; CHECK-NEXT: str z0, [sp]
215-
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
216-
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
217-
; CHECK-NEXT: ldr z0, [sp]
211+
; CHECK-NEXT: ptrue p0.d, vl8
212+
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
218213
; CHECK-NEXT: addvl sp, sp, #1
219214
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
220215
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,8 +331,7 @@ define void @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %p) nounwi
331331
; CHECK-LABEL: extract_fixed_v4i64_nxv2i64:
332332
; CHECK: // %bb.0:
333333
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #32
334-
; CHECK-NEXT: ptrue p0.d
335-
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
334+
; CHECK-NEXT: str z0, [x0]
336335
; CHECK-NEXT: ret
337336
%retval = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> %vec, i64 4)
338337
store <4 x i64> %retval, ptr %p

llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ define void @st1d_fixed(ptr %ptr) #0 {
1818
; CHECK-NEXT: ptrue p0.d
1919
; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x20]
2020
; CHECK-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
21-
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
21+
; CHECK-NEXT: str z0, [x19]
2222
; CHECK-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload
2323
; CHECK-NEXT: add sp, sp, #160
2424
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,11 +544,10 @@ define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
544544
define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
545545
; CHECK-LABEL: extract_subvector_legalization_v8i32:
546546
; CHECK: // %bb.0: // %entry
547-
; CHECK-NEXT: ptrue p0.s
548547
; CHECK-NEXT: adrp x8, .LCPI40_0
549548
; CHECK-NEXT: add x8, x8, :lo12:.LCPI40_0
550549
; CHECK-NEXT: ptrue p1.d
551-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
550+
; CHECK-NEXT: ldr z0, [x8]
552551
; CHECK-NEXT: mov z1.d, z0.d
553552
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
554553
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-convert.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ target triple = "aarch64-unknown-linux-gnu"
77
define void @fp_convert_combine_crash(ptr %a, ptr %b) #0 {
88
; CHECK-LABEL: fp_convert_combine_crash:
99
; CHECK: // %bb.0:
10+
; CHECK-NEXT: fmov z0.s, #8.00000000
11+
; CHECK-NEXT: ldr z1, [x0]
1012
; CHECK-NEXT: ptrue p0.s
11-
; CHECK-NEXT: fmov z1.s, #8.00000000
12-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
13-
; CHECK-NEXT: fmul z0.s, z0.s, z1.s
13+
; CHECK-NEXT: fmul z0.s, z1.s, z0.s
1414
; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
15-
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
15+
; CHECK-NEXT: str z0, [x1]
1616
; CHECK-NEXT: ret
1717
%f = load <8 x float>, ptr %a
1818
%mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,

llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,32 +20,31 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v
2020
; CHECK-NEXT: .cfi_offset w21, -24
2121
; CHECK-NEXT: .cfi_offset w22, -32
2222
; CHECK-NEXT: .cfi_offset w29, -48
23-
; CHECK-NEXT: ptrue p0.d
2423
; CHECK-NEXT: add x10, sp, #176
2524
; CHECK-NEXT: add x8, sp, #48
2625
; CHECK-NEXT: add x9, sp, #144
27-
; CHECK-NEXT: add x20, sp, #176
28-
; CHECK-NEXT: ldr x15, [sp, #104]
29-
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10]
30-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
26+
; CHECK-NEXT: ldr z3, [x10]
27+
; CHECK-NEXT: ldr z0, [x8]
3128
; CHECK-NEXT: add x8, sp, #112
32-
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9]
33-
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8]
34-
; CHECK-NEXT: ldur q4, [sp, #88]
29+
; CHECK-NEXT: ldr z2, [x9]
30+
; CHECK-NEXT: ldr z1, [x8]
31+
; CHECK-NEXT: add x20, sp, #176
3532
; CHECK-NEXT: ldp x9, x8, [sp, #328]
36-
; CHECK-NEXT: ldr x19, [sp, #272]
33+
; CHECK-NEXT: ldr x15, [sp, #104]
3734
; CHECK-NEXT: ldp x11, x10, [sp, #312]
35+
; CHECK-NEXT: ldur q4, [sp, #88]
3836
; CHECK-NEXT: ldp x13, x12, [sp, #296]
37+
; CHECK-NEXT: ldr x19, [sp, #272]
3938
; CHECK-NEXT: ldp x18, x14, [sp, #280]
4039
; CHECK-NEXT: ldp x16, x17, [sp, #208]
4140
; CHECK-NEXT: ldp x21, x22, [sp, #352]
42-
; CHECK-NEXT: st1d { z3.d }, p0, [x20]
41+
; CHECK-NEXT: str z3, [x20]
4342
; CHECK-NEXT: add x20, sp, #144
44-
; CHECK-NEXT: st1d { z2.d }, p0, [x20]
43+
; CHECK-NEXT: str z2, [x20]
4544
; CHECK-NEXT: add x20, sp, #112
46-
; CHECK-NEXT: st1d { z1.d }, p0, [x20]
45+
; CHECK-NEXT: str z1, [x20]
4746
; CHECK-NEXT: add x20, sp, #48
48-
; CHECK-NEXT: st1d { z0.d }, p0, [x20]
47+
; CHECK-NEXT: str z0, [x20]
4948
; CHECK-NEXT: stp x21, x22, [sp, #352]
5049
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
5150
; CHECK-NEXT: stp x19, x18, [sp, #272]

llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
22
; RUN: llc -debug-only=isel < %s 2>&1 | FileCheck %s
33

44
; REQUIRES: asserts
@@ -9,16 +9,15 @@ target triple = "aarch64-unknown-linux-gnu"
99
; accessing fixed width objects.
1010
define void @foo(ptr %a) #0 {
1111
; CHECK-LABEL: foo:
12-
; CHECK: SelectionDAG has 15 nodes:
12+
; CHECK: SelectionDAG has 13 nodes:
1313
; CHECK-NEXT: t0: ch,glue = EntryToken
14-
; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31>
1514
; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0
16-
; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM<Mem:(volatile load (s512) from %ir.a)> t12, t2, TargetConstant:i64<0>, t0
15+
; CHECK-NEXT: t21: nxv2i64,ch = LDR_ZXI<Mem:(volatile load (<vscale x 1 x s128>) from %ir.a, align 64)> t2, TargetConstant:i64<0>, t0
1716
; CHECK-NEXT: t8: i64 = ADDXri TargetFrameIndex:i64<1>, TargetConstant:i32<0>, TargetConstant:i32<0>
1817
; CHECK-NEXT: t6: i64 = ADDXri TargetFrameIndex:i64<0>, TargetConstant:i32<0>, TargetConstant:i32<0>
19-
; CHECK-NEXT: t17: ch = ST1D_IMM<Mem:(volatile store (s512) into %ir.r0)> t18, t12, t6, TargetConstant:i64<0>, t18:1
20-
; CHECK-NEXT: t16: ch = ST1D_IMM<Mem:(volatile store (s512) into %ir.r1)> t18, t12, t8, TargetConstant:i64<0>, t17
21-
; CHECK-NEXT: t10: ch = RET_ReallyLR t16
18+
; CHECK-NEXT: t22: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r0, align 64)> t21, t6, TargetConstant:i64<0>, t21:1
19+
; CHECK-NEXT: t23: ch = STR_ZXI<Mem:(volatile store (<vscale x 1 x s128>) into %ir.r1, align 64)> t21, t8, TargetConstant:i64<0>, t22
20+
; CHECK-NEXT: t10: ch = RET_ReallyLR t23
2221
; CHECK-EMPTY:
2322
entry:
2423
%r0 = alloca <8 x i64>

llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -380,11 +380,10 @@ define void @v8i32(ptr %ldptr, ptr %stptr) {
380380
;
381381
; CHECK-256-LABEL: v8i32:
382382
; CHECK-256: // %bb.0:
383-
; CHECK-256-NEXT: ptrue p0.s
384-
; CHECK-256-NEXT: ld1w { z0.s }, p0/z, [x0, #2, mul vl]
385-
; CHECK-256-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
386-
; CHECK-256-NEXT: st1w { z0.s }, p0, [x1, #2, mul vl]
387-
; CHECK-256-NEXT: st1w { z1.s }, p0, [x1, #1, mul vl]
383+
; CHECK-256-NEXT: ldr z0, [x0, #2, mul vl]
384+
; CHECK-256-NEXT: ldr z1, [x0, #1, mul vl]
385+
; CHECK-256-NEXT: str z0, [x1, #2, mul vl]
386+
; CHECK-256-NEXT: str z1, [x1, #1, mul vl]
388387
; CHECK-256-NEXT: ret
389388
;
390389
; CHECK-512-LABEL: v8i32:
@@ -437,8 +436,7 @@ define void @v8i32_vscale(ptr %0) {
437436
; CHECK-256-LABEL: v8i32_vscale:
438437
; CHECK-256: // %bb.0:
439438
; CHECK-256-NEXT: mov z0.s, #1 // =0x1
440-
; CHECK-256-NEXT: ptrue p0.s
441-
; CHECK-256-NEXT: st1w { z0.s }, p0, [x0, #2, mul vl]
439+
; CHECK-256-NEXT: str z0, [x0, #2, mul vl]
442440
; CHECK-256-NEXT: ret
443441
;
444442
; CHECK-512-LABEL: v8i32_vscale:

0 commit comments

Comments
 (0)