Skip to content
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1752,6 +1752,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (IsVGPR) {
// Implicit VGPR defs and uses are never a part of the memory
// instructions description and usually present to account for
// super-register liveness.
// TODO: Most of the other instructions also have implicit uses
// for the liveness accounting only.
Comment on lines +1758 to +1759
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

COPY and the MOVs are the most common cases

if (Op.isImplicit() && MI.mayLoadOrStore())
continue;

// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
// instruction, in which case they are (in some architectures)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ; kill: killed $vgpr4
; CHECK-NEXT: s_xor_saveexec_b32 s4, -1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s4
Expand Down
23 changes: 17 additions & 6 deletions llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -778,8 +778,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_7
; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
Expand Down Expand Up @@ -824,8 +824,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_6
; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
Expand Down Expand Up @@ -1242,10 +1242,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_7
; GCN-O0-NEXT: .LBB5_6: ; %Flow
Expand All @@ -1263,10 +1266,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(3)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_branch .LBB5_5
; GCN-O0-NEXT: .LBB5_7: ; %bb10
Expand Down Expand Up @@ -1336,10 +1342,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
Expand All @@ -1356,9 +1365,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: s_waitcnt expcnt(2)
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt expcnt(1)
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
Expand Down
Loading