Skip to content

Commit 4f90e75

Browse files
authored
[AMDGPU] Do not count implicit VGPRs in SIInsertWaitcnts (#109049)
When generating waitcounts before a use or def, skip implicit VGPRs. We never have real implicit VGPR operands on memory instructions; they are present only for super-register liveness accounting. Some other instructions (MOVRELS, for example) may have real implicit VGPR uses, though. This is less than ideal, but most of the problems were observed with spills.
1 parent 8e3cde0 commit 4f90e75

13 files changed

+332
-130
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1752,6 +1752,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17521752
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
17531753
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
17541754
if (IsVGPR) {
1755+
// Implicit VGPR defs and uses are never a part of the memory
1756+
// instructions description and usually present to account for
1757+
// super-register liveness.
1758+
// TODO: Most of the other instructions also have implicit uses
1759+
// for the liveness accounting only.
1760+
if (Op.isImplicit() && MI.mayLoadOrStore())
1761+
continue;
1762+
17551763
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
17561764
// previous write and this write are the same type of VMEM
17571765
// instruction, in which case they are (in some architectures)

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
187187
; CHECK-NEXT: v_mov_b32_e32 v3, s4
188188
; CHECK-NEXT: ; kill: killed $vgpr4
189189
; CHECK-NEXT: s_xor_saveexec_b32 s4, -1
190-
; CHECK-NEXT: s_waitcnt vmcnt(0)
191190
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
192191
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
193192
; CHECK-NEXT: s_mov_b32 exec_lo, s4

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -778,8 +778,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
778778
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
779779
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
780780
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
781-
; GCN-O0-NEXT: s_waitcnt expcnt(1)
782781
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
782+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
783783
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
784784
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
785785
; GCN-O0-NEXT: s_mov_b32 s2, 0
@@ -824,8 +824,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
824824
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
825825
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
826826
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
827-
; GCN-O0-NEXT: s_waitcnt expcnt(1)
828827
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
828+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
829829
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
830830
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
831831
; GCN-O0-NEXT: s_mov_b32 s2, 0
@@ -1242,10 +1242,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12421242
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
12431243
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
12441244
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1245-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1245+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12461246
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1247+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12471248
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
1249+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12481250
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
1251+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12491252
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
12501253
; GCN-O0-NEXT: s_branch .LBB5_7
12511254
; GCN-O0-NEXT: .LBB5_6: ; %Flow
@@ -1263,10 +1266,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12631266
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
12641267
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
12651268
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1266-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1269+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12671270
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1271+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12681272
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
1273+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12691274
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
1275+
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
12701276
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
12711277
; GCN-O0-NEXT: s_branch .LBB5_5
12721278
; GCN-O0-NEXT: .LBB5_7: ; %bb10
@@ -1336,10 +1342,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13361342
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13371343
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
13381344
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1339-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1345+
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
13401346
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
1347+
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
13411348
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
1349+
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
13421350
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
1351+
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
13431352
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
13441353
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
13451354
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
@@ -1356,9 +1365,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13561365
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13571366
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
13581367
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1359-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1368+
; GCN-O0-NEXT: s_waitcnt expcnt(2)
13601369
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
1370+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
13611371
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
1372+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
13621373
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
13631374
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
13641375
; GCN-O0-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)