Skip to content

Commit a779af3

Browse files
authored
[AMDGPU] Change SGPR layout to striped caller/callee saved (#127353)
This PR updates the SGPR layout to a striped caller/callee-saved design, similar to the VGPR layout. To ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer), and s34 (base pointer) remain callee-saved, the striped layout starts from s40, with a stripe width of 8. The last stripe is 10 wide instead of 8 to avoid ending with a 2-wide stripe. Concretely, s30-s39, s48-s55, s64-s71, s80-s87, and s96-s105 are callee-saved, while the stripes in between (s40-s47, s56-s63, s72-s79, and s88-s95) are caller-saved. Fixes #113782.
1 parent e86c5a7 commit a779af3

File tree

57 files changed

+9334
-13577
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+9334
-13577
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td

+9-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,15 @@ def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
9191
>;
9292

9393
def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
94-
(sequence "SGPR%u", 30, 105)
94+
// Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer),
95+
// and s34 (base pointer) are callee-saved. The striped layout starts from s40,
96+
// with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid
97+
// ending with a 2-wide stripe.
98+
(add (sequence "SGPR%u", 30, 39),
99+
(sequence "SGPR%u", 48, 55),
100+
(sequence "SGPR%u", 64, 71),
101+
(sequence "SGPR%u", 80, 87),
102+
(sequence "SGPR%u", 96, 105))
95103
>;
96104

97105
def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

+115-115
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/bf16.ll

+92-243
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll

+21-21
Original file line numberDiff line numberDiff line change
@@ -9,41 +9,41 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
99
; CHECK-NEXT: s_addc_u32 s13, s13, 0
1010
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1111
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
12-
; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0
12+
; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0
1313
; CHECK-NEXT: s_add_u32 s0, s0, s17
1414
; CHECK-NEXT: s_addc_u32 s1, s1, 0
1515
; CHECK-NEXT: s_mov_b32 s12, 0
1616
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
17-
; CHECK-NEXT: s_cmp_lg_u32 s40, 0
17+
; CHECK-NEXT: s_cmp_lg_u32 s52, 0
1818
; CHECK-NEXT: s_cbranch_scc1 .LBB0_8
1919
; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
20-
; CHECK-NEXT: s_cmp_eq_u32 s42, 0
20+
; CHECK-NEXT: s_cmp_eq_u32 s54, 0
2121
; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
2222
; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
23-
; CHECK-NEXT: s_cmp_lg_u32 s43, 0
23+
; CHECK-NEXT: s_cmp_lg_u32 s55, 0
2424
; CHECK-NEXT: s_mov_b32 s17, 0
2525
; CHECK-NEXT: s_cselect_b32 s12, -1, 0
2626
; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
2727
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
2828
; CHECK-NEXT: ; %bb.3:
29-
; CHECK-NEXT: s_mov_b32 s36, 0
29+
; CHECK-NEXT: s_mov_b32 s48, 0
3030
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
3131
; CHECK-NEXT: s_cbranch_vccz .LBB0_6
3232
; CHECK-NEXT: s_branch .LBB0_7
3333
; CHECK-NEXT: .LBB0_4:
3434
; CHECK-NEXT: s_mov_b32 s14, s12
3535
; CHECK-NEXT: s_mov_b32 s15, s12
3636
; CHECK-NEXT: s_mov_b32 s13, s12
37-
; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
38-
; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
37+
; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15]
38+
; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13]
3939
; CHECK-NEXT: s_branch .LBB0_7
4040
; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
41-
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
42-
; CHECK-NEXT: s_mov_b32 s36, 1.0
41+
; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0
42+
; CHECK-NEXT: s_mov_b32 s48, 1.0
4343
; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
44-
; CHECK-NEXT: s_mov_b32 s37, s36
45-
; CHECK-NEXT: s_mov_b32 s38, s36
46-
; CHECK-NEXT: s_mov_b32 s39, s36
44+
; CHECK-NEXT: s_mov_b32 s49, s48
45+
; CHECK-NEXT: s_mov_b32 s50, s48
46+
; CHECK-NEXT: s_mov_b32 s51, s48
4747
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
4848
; CHECK-NEXT: s_cbranch_vccnz .LBB0_7
4949
; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i
@@ -55,7 +55,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
5555
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5656
; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0
5757
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1
58-
; CHECK-NEXT: v_add_f32_e64 v1, s17, s36
58+
; CHECK-NEXT: v_add_f32_e64 v1, s17, s48
5959
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
6060
; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13]
6161
; CHECK-NEXT: s_mov_b32 s12, s14
@@ -65,13 +65,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
6565
; CHECK-NEXT: v_mov_b32_e32 v2, 0
6666
; CHECK-NEXT: s_mov_b32 s13, s15
6767
; CHECK-NEXT: s_mov_b32 s14, s16
68-
; CHECK-NEXT: s_mov_b32 s36, 0
68+
; CHECK-NEXT: s_mov_b32 s48, 0
6969
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
7070
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
7171
; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35]
72-
; CHECK-NEXT: s_mov_b32 s37, s36
73-
; CHECK-NEXT: s_mov_b32 s38, s36
74-
; CHECK-NEXT: s_mov_b32 s39, s36
72+
; CHECK-NEXT: s_mov_b32 s49, s48
73+
; CHECK-NEXT: s_mov_b32 s50, s48
74+
; CHECK-NEXT: s_mov_b32 s51, s48
7575
; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i
7676
; CHECK-NEXT: v_mov_b32_e32 v0, 0
7777
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
@@ -80,11 +80,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
8080
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
8181
; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
8282
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
83-
; CHECK-NEXT: v_mov_b32_e32 v0, s36
83+
; CHECK-NEXT: v_mov_b32_e32 v0, s48
8484
; CHECK-NEXT: v_mov_b32_e32 v4, 0
85-
; CHECK-NEXT: v_mov_b32_e32 v1, s37
86-
; CHECK-NEXT: v_mov_b32_e32 v2, s38
87-
; CHECK-NEXT: v_mov_b32_e32 v3, s39
85+
; CHECK-NEXT: v_mov_b32_e32 v1, s49
86+
; CHECK-NEXT: v_mov_b32_e32 v2, s50
87+
; CHECK-NEXT: v_mov_b32_e32 v3, s51
8888
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
8989
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
9090
; CHECK-NEXT: s_endpgm

0 commit comments

Comments
 (0)