Skip to content

Commit 96c3207

Browse files
authored
[NVPTX] Copy kernel arguments as byte array (#110356)
Ensures that struct padding is not skipped, as it may contain actual data if the struct is really a union. The patch originated from a discussion on #53710. Fixes #53710.
1 parent 0eaccee commit 96c3207

File tree

4 files changed

+66
-57
lines changed

4 files changed

+66
-57
lines changed

llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -626,10 +626,10 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
626626
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
627627
// addrspacecast preserves alignment. Since params are constant, this load
628628
// is definitely not volatile.
629-
LoadInst *LI =
630-
new LoadInst(StructType, ArgInParam, Arg->getName(),
631-
/*isVolatile=*/false, AllocA->getAlign(), FirstInst);
632-
new StoreInst(LI, AllocA, FirstInst);
629+
const auto ArgSize = *AllocA->getAllocationSize(DL);
630+
IRBuilder<> IRB(&*FirstInst);
631+
IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
632+
ArgSize);
633633
}
634634
}
635635

llvm/test/CodeGen/NVPTX/lower-args.ll

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,13 @@ target triple = "nvptx64-nvidia-cuda"
99

1010
%class.outer = type <{ %class.inner, i32, [4 x i8] }>
1111
%class.inner = type { ptr, ptr }
12+
%class.padded = type { i8, i32 }
1213

1314
; Check that nvptx-lower-args preserves arg alignment
1415
; COMMON-LABEL: load_alignment
1516
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
1617
entry:
17-
; IR: load %class.outer, ptr addrspace(101)
18-
; IR-SAME: align 8
18+
; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
1919
; PTX: ld.param.u64
2020
; PTX-NOT: ld.param.u8
2121
%arg.idx.val = load ptr, ptr %arg, align 8
@@ -33,6 +33,36 @@ entry:
3333
ret void
3434
}
3535

36+
; Check that nvptx-lower-args copies padding as the struct may have been a union
37+
; COMMON-LABEL: load_padding
38+
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
39+
; PTX: {
40+
; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
41+
; PTX-NEXT: .reg .b64 %SP;
42+
; PTX-NEXT: .reg .b64 %SPL;
43+
; PTX-NEXT: .reg .b64 %rd<5>;
44+
; PTX-EMPTY:
45+
; PTX-NEXT: // %bb.0:
46+
; PTX-NEXT: mov.u64 %SPL, __local_depot1;
47+
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
48+
; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0];
49+
; PTX-NEXT: st.u64 [%SP+0], %rd1;
50+
; PTX-NEXT: add.u64 %rd2, %SP, 0;
51+
; PTX-NEXT: { // callseq 1, 0
52+
; PTX-NEXT: .param .b64 param0;
53+
; PTX-NEXT: st.param.b64 [param0+0], %rd2;
54+
; PTX-NEXT: .param .b64 retval0;
55+
; PTX-NEXT: call.uni (retval0),
56+
; PTX-NEXT: escape,
57+
; PTX-NEXT: (
58+
; PTX-NEXT: param0
59+
; PTX-NEXT: );
60+
; PTX-NEXT: ld.param.b64 %rd3, [retval0+0];
61+
; PTX-NEXT: } // callseq 1
62+
; PTX-NEXT: ret;
63+
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
64+
ret void
65+
}
3666

3767
; COMMON-LABEL: ptr_generic
3868
define void @ptr_generic(ptr %out, ptr %in) {

llvm/test/CodeGen/NVPTX/lower-byval-args.ll

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
8888
; COMMON-NEXT: [[ENTRY:.*:]]
8989
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
9090
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
91-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
92-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
91+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
9392
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
9493
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
9594
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -115,8 +114,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
115114
; COMMON-NEXT: [[ENTRY:.*:]]
116115
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
117116
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
118-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
119-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
117+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
120118
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
121119
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
122120
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
@@ -134,8 +132,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
134132
; COMMON-NEXT: [[ENTRY:.*:]]
135133
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
136134
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
137-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
138-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
135+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
139136
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
140137
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
141138
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -155,8 +152,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
155152
; COMMON-NEXT: [[ENTRY:.*:]]
156153
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
157154
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
158-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
159-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
155+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
160156
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
161157
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
162158
; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
@@ -174,8 +170,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
174170
; COMMON-NEXT: [[ENTRY:.*:]]
175171
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
176172
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
177-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
178-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
173+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
179174
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
180175
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
181176
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -195,8 +190,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
195190
; COMMON-NEXT: [[ENTRY:.*:]]
196191
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
197192
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
198-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
199-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
193+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
200194
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
201195
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
202196
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
@@ -232,8 +226,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
232226
; COMMON-NEXT: [[ENTRY:.*:]]
233227
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
234228
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
235-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
236-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
229+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
237230
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
238231
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
239232
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
@@ -251,8 +244,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
251244
; COMMON-NEXT: [[BB:.*:]]
252245
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
253246
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
254-
; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
255-
; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
247+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
256248
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
257249
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
258250
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
@@ -273,12 +265,10 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
273265
; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
274266
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
275267
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
276-
; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
277-
; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
268+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
278269
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
279270
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
280-
; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
281-
; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
271+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
282272
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
283273
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
284274
; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
@@ -313,12 +303,10 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
313303
; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
314304
; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
315305
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
316-
; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
317-
; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
306+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
318307
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
319308
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
320-
; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
321-
; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
309+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
322310
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
323311
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
324312
; COMMON-NEXT: ret void
@@ -337,12 +325,10 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
337325
; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
338326
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
339327
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
340-
; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
341-
; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
328+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
342329
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
343330
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
344-
; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
345-
; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
331+
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
346332
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
347333
; SM_60: [[FIRST]]:
348334
; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -402,12 +388,10 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str
402388
; COMMON-NEXT: [[BB:.*:]]
403389
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
404390
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
405-
; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
406-
; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
391+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
407392
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
408393
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
409-
; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
410-
; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
394+
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
411395
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
412396
; COMMON: [[FIRST]]:
413397
; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0

llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,43 +9,38 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
99
; CHECK-NEXT: .local .align 8 .b8 __local_depot0[32];
1010
; CHECK-NEXT: .reg .b32 %SP;
1111
; CHECK-NEXT: .reg .b32 %SPL;
12-
; CHECK-NEXT: .reg .b32 %r<4>;
13-
; CHECK-NEXT: .reg .b64 %rd<17>;
12+
; CHECK-NEXT: .reg .b32 %r<2>;
13+
; CHECK-NEXT: .reg .b64 %rd<13>;
1414
; CHECK-EMPTY:
1515
; CHECK-NEXT: // %bb.0:
1616
; CHECK-NEXT: mov.u32 %SPL, __local_depot0;
1717
; CHECK-NEXT: cvta.local.u32 %SP, %SPL;
1818
; CHECK-NEXT: ld.param.u32 %r1, [caller_St8x4_param_1];
19-
; CHECK-NEXT: add.u32 %r3, %SPL, 0;
2019
; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+24];
20+
; CHECK-NEXT: st.u64 [%SP+24], %rd1;
2121
; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+16];
22+
; CHECK-NEXT: st.u64 [%SP+16], %rd2;
2223
; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0+8];
24+
; CHECK-NEXT: st.u64 [%SP+8], %rd3;
2325
; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0];
24-
; CHECK-NEXT: st.local.u64 [%r3], %rd4;
25-
; CHECK-NEXT: st.local.u64 [%r3+8], %rd3;
26-
; CHECK-NEXT: st.local.u64 [%r3+16], %rd2;
27-
; CHECK-NEXT: st.local.u64 [%r3+24], %rd1;
28-
; CHECK-NEXT: ld.u64 %rd5, [%SP+8];
29-
; CHECK-NEXT: ld.u64 %rd6, [%SP+0];
30-
; CHECK-NEXT: ld.u64 %rd7, [%SP+24];
31-
; CHECK-NEXT: ld.u64 %rd8, [%SP+16];
26+
; CHECK-NEXT: st.u64 [%SP+0], %rd4;
3227
; CHECK-NEXT: { // callseq 0, 0
3328
; CHECK-NEXT: .param .align 16 .b8 param0[32];
34-
; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd6, %rd5};
35-
; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd8, %rd7};
29+
; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd4, %rd3};
30+
; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd2, %rd1};
3631
; CHECK-NEXT: .param .align 16 .b8 retval0[32];
3732
; CHECK-NEXT: call.uni (retval0),
3833
; CHECK-NEXT: callee_St8x4,
3934
; CHECK-NEXT: (
4035
; CHECK-NEXT: param0
4136
; CHECK-NEXT: );
42-
; CHECK-NEXT: ld.param.v2.b64 {%rd9, %rd10}, [retval0+0];
43-
; CHECK-NEXT: ld.param.v2.b64 {%rd11, %rd12}, [retval0+16];
37+
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0+0];
38+
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
4439
; CHECK-NEXT: } // callseq 0
45-
; CHECK-NEXT: st.u64 [%r1], %rd9;
46-
; CHECK-NEXT: st.u64 [%r1+8], %rd10;
47-
; CHECK-NEXT: st.u64 [%r1+16], %rd11;
48-
; CHECK-NEXT: st.u64 [%r1+24], %rd12;
40+
; CHECK-NEXT: st.u64 [%r1], %rd5;
41+
; CHECK-NEXT: st.u64 [%r1+8], %rd6;
42+
; CHECK-NEXT: st.u64 [%r1+16], %rd7;
43+
; CHECK-NEXT: st.u64 [%r1+24], %rd8;
4944
; CHECK-NEXT: ret;
5045
%call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
5146
%.fca.0.extract = extractvalue [4 x i64] %call, 0

0 commit comments

Comments
 (0)