Skip to content

Commit 2f9ed9d

Browse files
authored
[AArch64][SVE] Select non-temporal instructions for unpredicated loads/stores with the nontemporal flag (#171261)
Add patterns to select SVE non-temporal load/store instructions (LDNT1*/STNT1*) for unpredicated scalable-vector loads/stores that carry the `nontemporal` flag. Previously, the regular (temporal) instructions were selected for these cases (e.g. `ldr`/`ld1w` for loads). Fixes #169034.
1 parent d4e9355 commit 2f9ed9d

File tree

4 files changed: 317 additions (+317) and 18 deletions (-18).

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3063,14 +3063,14 @@ let Predicates = [HasSVE_or_SME] in {
30633063
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
30643064
Instruction RegImmInst, Instruction PTrue,
30653065
ComplexPattern AddrCP> {
3066-
let AddedComplexity = 1 in {
3067-
def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
3068-
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
3069-
}
30703066
let AddedComplexity = 2 in {
30713067
def _imm : Pat<(Store Ty:$val, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
30723068
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
30733069
}
3070+
let AddedComplexity = 1 in {
3071+
def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
3072+
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
3073+
}
30743074

30753075
def : Pat<(Store Ty:$val, GPR64:$base),
30763076
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
@@ -3096,17 +3096,28 @@ let Predicates = [HasSVE_or_SME] in {
30963096
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
30973097
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
30983098

3099+
let AddedComplexity = 3 in {
3100+
defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
3101+
defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3102+
defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
3103+
defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
3104+
defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3105+
defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3106+
defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
3107+
defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
3108+
}
3109+
30993110
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
31003111
Instruction RegImmInst, Instruction PTrue,
31013112
ComplexPattern AddrCP> {
3102-
let AddedComplexity = 1 in {
3103-
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
3104-
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
3105-
}
31063113
let AddedComplexity = 2 in {
31073114
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
31083115
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
31093116
}
3117+
let AddedComplexity = 1 in {
3118+
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
3119+
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
3120+
}
31103121

31113122
def : Pat<(Ty (Load GPR64:$base)),
31123123
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
@@ -3144,6 +3155,17 @@ let Predicates = [HasSVE_or_SME] in {
31443155
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
31453156
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
31463157

3158+
let AddedComplexity = 3 in {
3159+
defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0>;
3160+
defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3161+
defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
3162+
defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
3163+
defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3164+
defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1>;
3165+
defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2>;
3166+
defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3>;
3167+
}
3168+
31473169
let Predicates = [HasSVE_or_SME, IsLE] in {
31483170
// Allow using the reg+reg form of ld1b/st1b for memory accesses with the
31493171
// same width as nxv16i8. This saves an add in cases where we would

llvm/test/CodeGen/AArch64/nontemporal-load.ll

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -612,21 +612,22 @@ define <16 x double> @test_ldnp_v16f64(ptr %A) {
612612
define <vscale x 20 x float> @test_ldnp_v20f32_vscale(ptr %A) {
613613
; CHECK-LABEL: test_ldnp_v20f32_vscale:
614614
; CHECK: ; %bb.0:
615-
; CHECK-NEXT: ldr z0, [x0]
616-
; CHECK-NEXT: ldr z1, [x0, #1, mul vl]
617-
; CHECK-NEXT: ldr z2, [x0, #2, mul vl]
618-
; CHECK-NEXT: ldr z3, [x0, #3, mul vl]
619-
; CHECK-NEXT: ldr z4, [x0, #4, mul vl]
615+
; CHECK-NEXT: ptrue p0.s
616+
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
617+
; CHECK-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
618+
; CHECK-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
619+
; CHECK-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
620+
; CHECK-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
620621
; CHECK-NEXT: ret
621622
;
622623
; CHECK-BE-LABEL: test_ldnp_v20f32_vscale:
623624
; CHECK-BE: // %bb.0:
624625
; CHECK-BE-NEXT: ptrue p0.s
625-
; CHECK-BE-NEXT: ld1w { z0.s }, p0/z, [x0]
626-
; CHECK-BE-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
627-
; CHECK-BE-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl]
628-
; CHECK-BE-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
629-
; CHECK-BE-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl]
626+
; CHECK-BE-NEXT: ldnt1w { z0.s }, p0/z, [x0]
627+
; CHECK-BE-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
628+
; CHECK-BE-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
629+
; CHECK-BE-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
630+
; CHECK-BE-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
630631
; CHECK-BE-NEXT: ret
631632
%lv = load<vscale x 20 x float>, ptr %A, align 8, !nontemporal !0
632633
ret <vscale x 20 x float> %lv
Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-LE %s
3+
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve < %s | FileCheck --check-prefixes=CHECK,CHECK-BE %s
4+
5+
define <vscale x 16 x i8> @load_nxv16i8(ptr %a) nounwind {
6+
; CHECK-LABEL: load_nxv16i8:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: ptrue p0.b
9+
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
10+
; CHECK-NEXT: ret
11+
%load = load <vscale x 16 x i8>, ptr %a, !nontemporal !0
12+
ret <vscale x 16 x i8> %load
13+
}
14+
15+
define <vscale x 8 x i16> @load_nxv8i16(ptr %a) nounwind {
16+
; CHECK-LABEL: load_nxv8i16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: ptrue p0.h
19+
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
20+
; CHECK-NEXT: ret
21+
%load = load <vscale x 8 x i16>, ptr %a, !nontemporal !0
22+
ret <vscale x 8 x i16> %load
23+
}
24+
25+
define <vscale x 4 x i32> @load_nxv4i32(ptr %a) nounwind {
26+
; CHECK-LABEL: load_nxv4i32:
27+
; CHECK: // %bb.0:
28+
; CHECK-NEXT: ptrue p0.s
29+
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
30+
; CHECK-NEXT: ret
31+
%load = load <vscale x 4 x i32>, ptr %a, !nontemporal !0
32+
ret <vscale x 4 x i32> %load
33+
}
34+
35+
define <vscale x 2 x i64> @load_nxv2i64(ptr %a) nounwind {
36+
; CHECK-LABEL: load_nxv2i64:
37+
; CHECK: // %bb.0:
38+
; CHECK-NEXT: ptrue p0.d
39+
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
40+
; CHECK-NEXT: ret
41+
%load = load <vscale x 2 x i64>, ptr %a, !nontemporal !0
42+
ret <vscale x 2 x i64> %load
43+
}
44+
45+
define <vscale x 8 x half> @load_nxv8f16(ptr %a) nounwind {
46+
; CHECK-LABEL: load_nxv8f16:
47+
; CHECK: // %bb.0:
48+
; CHECK-NEXT: ptrue p0.h
49+
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
50+
; CHECK-NEXT: ret
51+
%load = load <vscale x 8 x half>, ptr %a, !nontemporal !0
52+
ret <vscale x 8 x half> %load
53+
}
54+
55+
define <vscale x 8 x bfloat> @load_nxv8bf16(ptr %a) nounwind {
56+
; CHECK-LABEL: load_nxv8bf16:
57+
; CHECK: // %bb.0:
58+
; CHECK-NEXT: ptrue p0.h
59+
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
60+
; CHECK-NEXT: ret
61+
%load = load <vscale x 8 x bfloat>, ptr %a, !nontemporal !0
62+
ret <vscale x 8 x bfloat> %load
63+
}
64+
65+
define <vscale x 4 x float> @load_nxv4f32(ptr %a) nounwind {
66+
; CHECK-LABEL: load_nxv4f32:
67+
; CHECK: // %bb.0:
68+
; CHECK-NEXT: ptrue p0.s
69+
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
70+
; CHECK-NEXT: ret
71+
%load = load <vscale x 4 x float>, ptr %a, !nontemporal !0
72+
ret <vscale x 4 x float> %load
73+
}
74+
75+
define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
76+
; CHECK-LABEL: load_nxv2f64:
77+
; CHECK: // %bb.0:
78+
; CHECK-NEXT: ptrue p0.d
79+
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
80+
; CHECK-NEXT: ret
81+
%load = load <vscale x 2 x double>, ptr %a, !nontemporal !0
82+
ret <vscale x 2 x double> %load
83+
}
84+
85+
define <vscale x 16 x i8> @load_nxv16i8_reg(ptr %a, i64 %off) nounwind {
86+
; CHECK-LABEL: load_nxv16i8_reg:
87+
; CHECK: // %bb.0:
88+
; CHECK-NEXT: ptrue p0.b
89+
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, x1]
90+
; CHECK-NEXT: ret
91+
%ptr = getelementptr i8, ptr %a, i64 %off
92+
%load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
93+
ret <vscale x 16 x i8> %load
94+
}
95+
96+
define <vscale x 16 x i8> @load_nxv16i8_imm(ptr %a) nounwind {
97+
; CHECK-LABEL: load_nxv16i8_imm:
98+
; CHECK: // %bb.0:
99+
; CHECK-NEXT: ptrue p0.b
100+
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, #1, mul vl]
101+
; CHECK-NEXT: ret
102+
%ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
103+
%load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
104+
ret <vscale x 16 x i8> %load
105+
}
106+
107+
define <vscale x 2 x double> @load_nxv2f64_reg(ptr %a, i64 %off) nounwind {
108+
; CHECK-LABEL: load_nxv2f64_reg:
109+
; CHECK: // %bb.0:
110+
; CHECK-NEXT: ptrue p0.d
111+
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, x1, lsl #3]
112+
; CHECK-NEXT: ret
113+
%ptr = getelementptr double, ptr %a, i64 %off
114+
%load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
115+
ret <vscale x 2 x double> %load
116+
}
117+
118+
define <vscale x 2 x double> @load_nxv2f64_imm(ptr %a) nounwind {
119+
; CHECK-LABEL: load_nxv2f64_imm:
120+
; CHECK: // %bb.0:
121+
; CHECK-NEXT: ptrue p0.d
122+
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, #1, mul vl]
123+
; CHECK-NEXT: ret
124+
%ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
125+
%load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
126+
ret <vscale x 2 x double> %load
127+
}
128+
129+
define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
130+
; CHECK-LABEL: store_nxv16i8:
131+
; CHECK: // %bb.0:
132+
; CHECK-NEXT: ptrue p0.b
133+
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0]
134+
; CHECK-NEXT: ret
135+
store <vscale x 16 x i8> %x, ptr %a, !nontemporal !0
136+
ret void
137+
}
138+
139+
define void @store_nxv8i16(<vscale x 8 x i16> %x, ptr %a) nounwind {
140+
; CHECK-LABEL: store_nxv8i16:
141+
; CHECK: // %bb.0:
142+
; CHECK-NEXT: ptrue p0.h
143+
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
144+
; CHECK-NEXT: ret
145+
store <vscale x 8 x i16> %x, ptr %a, !nontemporal !0
146+
ret void
147+
}
148+
149+
define void @store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
150+
; CHECK-LABEL: store_nxv4i32:
151+
; CHECK: // %bb.0:
152+
; CHECK-NEXT: ptrue p0.s
153+
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
154+
; CHECK-NEXT: ret
155+
store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
156+
ret void
157+
}
158+
159+
define void @store_nxv2i64(<vscale x 2 x i64> %x, ptr %a) nounwind {
160+
; CHECK-LABEL: store_nxv2i64:
161+
; CHECK: // %bb.0:
162+
; CHECK-NEXT: ptrue p0.d
163+
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
164+
; CHECK-NEXT: ret
165+
store <vscale x 2 x i64> %x, ptr %a, !nontemporal !0
166+
ret void
167+
}
168+
169+
define void @store_nxv8f16(<vscale x 8 x half> %x, ptr %a) nounwind {
170+
; CHECK-LABEL: store_nxv8f16:
171+
; CHECK: // %bb.0:
172+
; CHECK-NEXT: ptrue p0.h
173+
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
174+
; CHECK-NEXT: ret
175+
store <vscale x 8 x half> %x, ptr %a, !nontemporal !0
176+
ret void
177+
}
178+
179+
define void @store_nxv8bf16(<vscale x 8 x bfloat> %x, ptr %a) nounwind {
180+
; CHECK-LABEL: store_nxv8bf16:
181+
; CHECK: // %bb.0:
182+
; CHECK-NEXT: ptrue p0.h
183+
; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
184+
; CHECK-NEXT: ret
185+
store <vscale x 8 x bfloat> %x, ptr %a, !nontemporal !0
186+
ret void
187+
}
188+
189+
define void @store_nxv4f32(<vscale x 4 x float> %x, ptr %a) nounwind {
190+
; CHECK-LABEL: store_nxv4f32:
191+
; CHECK: // %bb.0:
192+
; CHECK-NEXT: ptrue p0.s
193+
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
194+
; CHECK-NEXT: ret
195+
store <vscale x 4 x float> %x, ptr %a, !nontemporal !0
196+
ret void
197+
}
198+
199+
define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
200+
; CHECK-LABEL: store_nxv2f64:
201+
; CHECK: // %bb.0:
202+
; CHECK-NEXT: ptrue p0.d
203+
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
204+
; CHECK-NEXT: ret
205+
store <vscale x 2 x double> %x, ptr %a, !nontemporal !0
206+
ret void
207+
}
208+
209+
define void @store_nxv16i8_reg(<vscale x 16 x i8> %x, ptr %a, i64 %off) nounwind {
210+
; CHECK-LABEL: store_nxv16i8_reg:
211+
; CHECK: // %bb.0:
212+
; CHECK-NEXT: ptrue p0.b
213+
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, x1]
214+
; CHECK-NEXT: ret
215+
%ptr = getelementptr i8, ptr %a, i64 %off
216+
store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
217+
ret void
218+
}
219+
220+
define void @store_nxv16i8_imm(<vscale x 16 x i8> %x, ptr %a) nounwind {
221+
; CHECK-LABEL: store_nxv16i8_imm:
222+
; CHECK: // %bb.0:
223+
; CHECK-NEXT: ptrue p0.b
224+
; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, #1, mul vl]
225+
; CHECK-NEXT: ret
226+
%ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
227+
store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
228+
ret void
229+
}
230+
231+
define void @store_nxv2f64_reg(<vscale x 2 x double> %x, ptr %a, i64 %off) nounwind {
232+
; CHECK-LABEL: store_nxv2f64_reg:
233+
; CHECK: // %bb.0:
234+
; CHECK-NEXT: ptrue p0.d
235+
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, x1, lsl #3]
236+
; CHECK-NEXT: ret
237+
%ptr = getelementptr double, ptr %a, i64 %off
238+
store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
239+
ret void
240+
}
241+
242+
define void @store_nxv2f64_imm(<vscale x 2 x double> %x, ptr %a) nounwind {
243+
; CHECK-LABEL: store_nxv2f64_imm:
244+
; CHECK: // %bb.0:
245+
; CHECK-NEXT: ptrue p0.d
246+
; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, #1, mul vl]
247+
; CHECK-NEXT: ret
248+
%ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
249+
store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
250+
ret void
251+
}
252+
253+
!0 = !{i32 1}
254+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
255+
; CHECK-BE: {{.*}}
256+
; CHECK-LE: {{.*}}

llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,26 @@ define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i
6666
ret void
6767
}
6868

69+
define <vscale x 4 x i32> @all_active_load_nxv4i32(ptr %a) nounwind {
70+
; CHECK-LABEL: all_active_load_nxv4i32:
71+
; CHECK: // %bb.0:
72+
; CHECK-NEXT: ptrue p0.s
73+
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
74+
; CHECK-NEXT: ret
75+
%load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !nontemporal !0
76+
ret <vscale x 4 x i32> %load
77+
}
78+
79+
define void @all_active_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
80+
; CHECK-LABEL: all_active_store_nxv4i32:
81+
; CHECK: // %bb.0:
82+
; CHECK-NEXT: ptrue p0.s
83+
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
84+
; CHECK-NEXT: ret
85+
call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true)), !nontemporal !0
86+
ret void
87+
}
88+
6989
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
7090
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
7191
declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)

0 commit comments

Comments
 (0)