Skip to content

Commit 0065c31

Browse files
committed
[SelectionDAG] Use unaligned store to move AVX registers onto stack for extractelement
Prior to this patch, SelectionDAG generated an aligned move onto the stack for AVX registers even when the function was marked as a no-realign-stack function. This led to a mismatch between the actual alignment of the stack slot and the alignment assumed by the generated instruction. This patch fixes the issue by using an unaligned store in that case. Fixes #77730
1 parent 7f1d757 commit 0065c31

File tree

3 files changed

+80
-37
lines changed

3 files changed

+80
-37
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/Analysis/ConstantFolding.h"
2222
#include "llvm/Analysis/TargetLibraryInfo.h"
2323
#include "llvm/CodeGen/ISDOpcodes.h"
24+
#include "llvm/CodeGen/MachineFrameInfo.h"
2425
#include "llvm/CodeGen/MachineFunction.h"
2526
#include "llvm/CodeGen/MachineJumpTableInfo.h"
2627
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -1377,6 +1378,20 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
13771378
}
13781379
}
13791380

1381+
// Helper function that generates an MMO that considers the alignment of the
1382+
// stack, and the size of the stack object
1383+
static MachineMemOperand *getStackAlignedMMO(SDValue StackPtr,
1384+
MachineFunction &MF) {
1385+
auto &MFI = MF.getFrameInfo();
1386+
int FI = cast<FrameIndexSDNode>(StackPtr)->getIndex();
1387+
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
1388+
MachineMemOperand *MMO =
1389+
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
1390+
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
1391+
1392+
return MMO;
1393+
}
1394+
13801395
SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
13811396
SDValue Vec = Op.getOperand(0);
13821397
SDValue Idx = Op.getOperand(1);
@@ -1426,8 +1441,9 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
14261441
if (!Ch.getNode()) {
14271442
// Store the value to a temporary stack slot, then LOAD the returned part.
14281443
StackPtr = DAG.CreateStackTemporary(VecVT);
1429-
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
1430-
MachinePointerInfo());
1444+
MachineMemOperand *StoreMMO =
1445+
getStackAlignedMMO(StackPtr, DAG.getMachineFunction());
1446+
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, StoreMMO);
14311447
}
14321448

14331449
SDValue NewLoad;

llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK
33

44
; Should codegen to a nop, since idx is zero.
@@ -84,14 +84,15 @@ define <4 x i32> @extract_v4i32_nxv2i32_idx4(<vscale x 2 x i32> %vec) nounwind #
8484
; CHECK: // %bb.0:
8585
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
8686
; CHECK-NEXT: addvl sp, sp, #-1
87-
; CHECK-NEXT: ptrue p0.d
87+
; CHECK-NEXT: ptrue p0.d, vl4
8888
; CHECK-NEXT: mov x8, #4 // =0x4
8989
; CHECK-NEXT: mov x9, sp
90-
; CHECK-NEXT: ptrue p1.d, vl4
91-
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
92-
; CHECK-NEXT: ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
93-
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
94-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
90+
; CHECK-NEXT: mov z2.d, z0.d
91+
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9, x8, lsl #3]
92+
; CHECK-NEXT: ptrue p0.d
93+
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
94+
; CHECK-NEXT: st1d { z2.d }, p0, [sp]
95+
; CHECK-NEXT: mov v0.16b, v1.16b
9596
; CHECK-NEXT: addvl sp, sp, #1
9697
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
9798
; CHECK-NEXT: ret
@@ -149,14 +150,15 @@ define <8 x i16> @extract_v8i16_nxv4i16_idx8(<vscale x 4 x i16> %vec) nounwind #
149150
; CHECK: // %bb.0:
150151
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
151152
; CHECK-NEXT: addvl sp, sp, #-1
152-
; CHECK-NEXT: ptrue p0.s
153+
; CHECK-NEXT: ptrue p0.s, vl8
153154
; CHECK-NEXT: mov x8, #8 // =0x8
154155
; CHECK-NEXT: mov x9, sp
155-
; CHECK-NEXT: ptrue p1.s, vl8
156-
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
157-
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
158-
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
159-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
156+
; CHECK-NEXT: mov z2.d, z0.d
157+
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, x8, lsl #2]
158+
; CHECK-NEXT: ptrue p0.s
159+
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
160+
; CHECK-NEXT: st1w { z2.s }, p0, [sp]
161+
; CHECK-NEXT: mov v0.16b, v1.16b
160162
; CHECK-NEXT: addvl sp, sp, #1
161163
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
162164
; CHECK-NEXT: ret
@@ -182,15 +184,16 @@ define <8 x i16> @extract_v8i16_nxv2i16_idx8(<vscale x 2 x i16> %vec) nounwind #
182184
; CHECK: // %bb.0:
183185
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
184186
; CHECK-NEXT: addvl sp, sp, #-1
185-
; CHECK-NEXT: ptrue p0.d
187+
; CHECK-NEXT: ptrue p0.d, vl8
186188
; CHECK-NEXT: mov x8, #8 // =0x8
187189
; CHECK-NEXT: mov x9, sp
188-
; CHECK-NEXT: ptrue p1.d, vl8
189-
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
190-
; CHECK-NEXT: ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
191-
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
192-
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
193-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
190+
; CHECK-NEXT: mov z2.d, z0.d
191+
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9, x8, lsl #3]
192+
; CHECK-NEXT: ptrue p0.d
193+
; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
194+
; CHECK-NEXT: st1d { z2.d }, p0, [sp]
195+
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
196+
; CHECK-NEXT: mov v0.16b, v1.16b
194197
; CHECK-NEXT: addvl sp, sp, #1
195198
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
196199
; CHECK-NEXT: ret
@@ -247,14 +250,15 @@ define <16 x i8> @extract_v16i8_nxv8i8_idx16(<vscale x 8 x i8> %vec) nounwind #1
247250
; CHECK: // %bb.0:
248251
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
249252
; CHECK-NEXT: addvl sp, sp, #-1
250-
; CHECK-NEXT: ptrue p0.h
253+
; CHECK-NEXT: ptrue p0.h, vl16
251254
; CHECK-NEXT: mov x8, #16 // =0x10
252255
; CHECK-NEXT: mov x9, sp
253-
; CHECK-NEXT: ptrue p1.h, vl16
254-
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
255-
; CHECK-NEXT: ld1h { z0.h }, p1/z, [x9, x8, lsl #1]
256-
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
257-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
256+
; CHECK-NEXT: mov z2.d, z0.d
257+
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x9, x8, lsl #1]
258+
; CHECK-NEXT: ptrue p0.h
259+
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
260+
; CHECK-NEXT: st1h { z2.h }, p0, [sp]
261+
; CHECK-NEXT: mov v0.16b, v1.16b
258262
; CHECK-NEXT: addvl sp, sp, #1
259263
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
260264
; CHECK-NEXT: ret
@@ -280,15 +284,16 @@ define <16 x i8> @extract_v16i8_nxv4i8_idx16(<vscale x 4 x i8> %vec) nounwind #1
280284
; CHECK: // %bb.0:
281285
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
282286
; CHECK-NEXT: addvl sp, sp, #-1
283-
; CHECK-NEXT: ptrue p0.s
287+
; CHECK-NEXT: ptrue p0.s, vl16
284288
; CHECK-NEXT: mov x8, #16 // =0x10
285289
; CHECK-NEXT: mov x9, sp
286-
; CHECK-NEXT: ptrue p1.s, vl16
287-
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
288-
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
289-
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
290-
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
291-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
290+
; CHECK-NEXT: mov z2.d, z0.d
291+
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, x8, lsl #2]
292+
; CHECK-NEXT: ptrue p0.s
293+
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
294+
; CHECK-NEXT: st1w { z2.s }, p0, [sp]
295+
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
296+
; CHECK-NEXT: mov v0.16b, v1.16b
292297
; CHECK-NEXT: addvl sp, sp, #1
293298
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
294299
; CHECK-NEXT: ret
@@ -437,8 +442,10 @@ define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
437442
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
438443
; CHECK-NEXT: addvl sp, sp, #-1
439444
; CHECK-NEXT: ptrue p0.d
440-
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
441-
; CHECK-NEXT: ldr q0, [sp, #16]
445+
; CHECK-NEXT: mov z2.d, z0.d
446+
; CHECK-NEXT: ldr q1, [sp, #16]
447+
; CHECK-NEXT: mov v0.16b, v1.16b
448+
; CHECK-NEXT: st1d { z2.d }, p0, [sp]
442449
; CHECK-NEXT: addvl sp, sp, #1
443450
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
444451
; CHECK-NEXT: ret
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
3+
4+
define i32 @foo(i32 %arg1) #0 {
5+
; CHECK-LABEL: foo:
6+
; CHECK: # %bb.0: # %entry
7+
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
8+
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
9+
; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
10+
; CHECK-NEXT: andl $31, %edi
11+
; CHECK-NEXT: movzbl -40(%rsp,%rdi), %eax
12+
; CHECK-NEXT: vzeroupper
13+
; CHECK-NEXT: retq
14+
entry:
15+
%a = extractelement <32 x i8> zeroinitializer, i32 %arg1
16+
%b = zext i8 %a to i32
17+
ret i32 %b
18+
}
19+
20+
attributes #0 = { "no-realign-stack" "target-cpu"="skylake-avx512" }

0 commit comments

Comments
 (0)