diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index a566827260138..80a7529002ac9 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1752,6 +1752,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (IsVGPR) { + // Implicit VGPR defs and uses are never a part of the memory + // instructions description and usually present to account for + // super-register liveness. + // TODO: Most of the other instructions also have implicit uses + // for the liveness accounting only. + if (Op.isImplicit() && MI.mayLoadOrStore()) + continue; + // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the // previous write and this write are the same type of VMEM // instruction, in which case they are (in some architectures) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index 10cbc56cc5fbe..de973481f8230 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -187,7 +187,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ; kill: killed $vgpr4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 51f9cf73488ee..67a084068941a 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -778,8 +778,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then -; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 @@ -824,8 +824,8 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 -; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 @@ -1242,10 +1242,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow @@ -1263,10 +1266,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(3) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 @@ -1336,10 +1342,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(4) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 @@ -1356,9 +1365,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 712cecff40617..b541be9f5aa44 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -570,21 +570,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 @@ -599,9 +599,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -663,9 +663,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -1718,17 +1718,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_5 ; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 @@ -1743,11 +1747,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB0_9 ; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit @@ -1822,11 +1828,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 @@ -2787,21 +2795,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 @@ -2816,9 +2824,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2880,9 +2888,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3846,17 +3854,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_5 ; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 @@ -3871,11 +3883,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_branch .LBB1_9 ; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit @@ -3950,11 +3964,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-G-O0-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index b4fe112438b4f..60946956547a7 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -4111,7 +4111,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill @@ -4137,7 +4136,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload @@ -4146,12 +4144,19 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload @@ -4178,7 +4183,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill @@ -4597,7 +4601,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1 ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[16:17] -; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill @@ -4623,7 +4626,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload @@ -4632,12 +4634,19 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload @@ -4664,7 +4673,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill @@ -5912,7 +5920,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 7 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 8 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload @@ -5921,12 +5928,19 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload @@ -5953,7 +5967,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill @@ -6041,7 +6054,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 11 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 12 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload @@ -6050,12 +6062,19 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload @@ -6082,7 +6101,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill @@ -9175,7 +9193,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 6 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 7 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload @@ -9184,12 +9201,19 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload @@ -9216,7 +9240,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) { ; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill @@ -9641,7 +9664,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1 ; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill ; NOOPT-NEXT: s_mov_b64 exec, s[12:13] -; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill @@ -9667,7 +9689,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: s_waitcnt vmcnt(0) ; NOOPT-NEXT: v_readlane_b32 s0, v0, 9 ; NOOPT-NEXT: v_readlane_b32 s1, v0, 10 -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload @@ -9676,12 +9697,19 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(6) ; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(5) ; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(4) ; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(3) ; NOOPT-NEXT: buffer_load_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(2) ; NOOPT-NEXT: buffer_load_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(1) ; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload +; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload ; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload @@ -9708,7 +9736,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace ; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill -; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill ; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index fe5427048e8cf..e0c2d00891250 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3569,7 +3569,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -3577,7 +3576,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -4382,7 +4380,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -7350,12 +7347,15 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 @@ -7378,7 +7378,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 5ae2b91bdb3e7..4d7f1a9663c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -3098,7 +3098,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll index 85d342bf303c0..c302233e748fd 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -244,7 +244,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 42ed4c1f2e63d..dd6fd5aa384f6 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -258,7 +258,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 { ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index fd6e06afc67da..19cc60963e900 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -609,21 +609,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 @@ -638,9 +638,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -702,9 +702,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2007,21 +2007,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(7) ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 @@ -2036,9 +2036,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -2100,9 +2100,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-wait.mir b/llvm/test/CodeGen/AMDGPU/spill-wait.mir index 8e896252af89b..6983a2742a41c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-wait.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-wait.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GFX12 %s # There shall be no S_WAITCNT between two stores. @@ -10,14 +11,27 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $sgpr76_sgpr77_sgpr78_sgpr79 - ; GCN-LABEL: name: spill_vgpr_tuple - ; GCN: liveins: $vgpr0_vgpr1, $sgpr76_sgpr77_sgpr78_sgpr79 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vgpr64_vgpr65 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr64, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 672, 0, 0, implicit $exec, implicit-def $vgpr64_vgpr65, implicit $vgpr64_vgpr65 - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr65, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 676, 0, 0, implicit $exec, implicit $vgpr64_vgpr65 - ; GCN-NEXT: S_ENDPGM 0 + ; GFX9-LABEL: name: spill_vgpr_tuple + ; GFX9: liveins: $vgpr0_vgpr1, $sgpr76_sgpr77_sgpr78_sgpr79 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr64_vgpr65 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr64, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 672, 0, 0, implicit $exec, implicit-def $vgpr64_vgpr65, implicit $vgpr64_vgpr65 + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr65, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 676, 0, 0, implicit $exec, implicit $vgpr64_vgpr65 + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: spill_vgpr_tuple + ; GFX12: liveins: $vgpr0_vgpr1, $sgpr76_sgpr77_sgpr78_sgpr79 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr64_vgpr65 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr64, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 672, 0, 0, implicit $exec, implicit-def $vgpr64_vgpr65, implicit $vgpr64_vgpr65 + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr65, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 676, 0, 0, implicit $exec, implicit $vgpr64_vgpr65 + ; GFX12-NEXT: S_ENDPGM 0 $vgpr64_vgpr65 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec BUFFER_STORE_DWORD_OFFSET killed $vgpr64, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 672, 0, 0, implicit $exec, implicit-def $vgpr64_vgpr65, implicit $vgpr64_vgpr65 BUFFER_STORE_DWORD_OFFSET $vgpr65, $sgpr76_sgpr77_sgpr78_sgpr79, 0, 676, 0, 0, implicit $exec, implicit $vgpr64_vgpr65 @@ -33,14 +47,27 @@ body: | bb.0: liveins: $vgpr0, $sgpr10_sgpr11 - ; GCN-LABEL: name: load_vcc_wait - ; GCN: liveins: $vgpr0, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $vcc_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: $vgpr1 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX9-LABEL: name: load_vcc_wait + ; GFX9: liveins: $vgpr0, $sgpr10_sgpr11 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vcc_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX9-NEXT: S_WAITCNT 49279 + ; GFX9-NEXT: $vgpr1 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: load_vcc_wait + ; GFX12: liveins: $vgpr0, $sgpr10_sgpr11 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vcc_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr1 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 $vcc_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 $vgpr1 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec S_ENDPGM 0 @@ -55,14 +82,27 @@ body: | bb.0: liveins: $sgpr10_sgpr11, $vgpr0_vgpr1 - ; GCN-LABEL: name: load_flat_scr_lo_flat_load_wait - ; GCN: liveins: $sgpr10_sgpr11, $vgpr0_vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $flat_scr_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: S_ENDPGM 0 + ; GFX9-LABEL: name: load_flat_scr_lo_flat_load_wait + ; GFX9: liveins: $sgpr10_sgpr11, $vgpr0_vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $flat_scr_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX9-NEXT: S_WAITCNT 49279 + ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: load_flat_scr_lo_flat_load_wait + ; GFX12: liveins: $sgpr10_sgpr11, $vgpr0_vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $flat_scr_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX12-NEXT: S_ENDPGM 0 $flat_scr_lo = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -75,15 +115,120 @@ body: | bb.0: liveins: $sgpr10_sgpr11, $vgpr0, $sgpr32 - ; GCN-LABEL: name: load_flat_scr_lo_scratch_store_wait - ; GCN: liveins: $sgpr10_sgpr11, $vgpr0, $sgpr32 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_WAITCNT 0 - ; GCN-NEXT: $flat_scr_hi = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: S_ENDPGM 0 + ; GFX9-LABEL: name: load_flat_scr_lo_scratch_store_wait + ; GFX9: liveins: $sgpr10_sgpr11, $vgpr0, $sgpr32 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $flat_scr_hi = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX9-NEXT: S_WAITCNT 49279 + ; GFX9-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: load_flat_scr_lo_scratch_store_wait + ; GFX12: liveins: $sgpr10_sgpr11, $vgpr0, $sgpr32 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $flat_scr_hi = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + ; GFX12-NEXT: S_ENDPGM 0 $flat_scr_hi = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0 SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... + +# Check that implicit spill defs do not force wait to zero on the first store + +--- +name: spill_load_store + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32 + + ; GFX9-LABEL: name: spill_load_store + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: S_WAITCNT 3955 + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: S_WAITCNT 3955 + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT 3955 + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT 3955 + ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: spill_load_store + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX12-NEXT: S_WAIT_LOADCNT 3 + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX12-NEXT: S_WAIT_LOADCNT 2 + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_LOADCNT 1 + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_LOADCNT 0 + ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + S_ENDPGM 0 +... + +# Make sure we have wait to mitigate WAW on gfx12 + +--- +name: scratch_load_waw +body: | + bb.0.entry: + liveins: $vgpr0, $sgpr0 + + ; GFX9-LABEL: name: scratch_load_waw + ; GFX9: liveins: $vgpr0, $sgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_SHORT_D16_HI_SADDR $sgpr0, 0, 0, $vgpr2, implicit $exec, implicit $flat_scr + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: scratch_load_waw + ; GFX12: liveins: $vgpr0, $sgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX12-NEXT: S_WAIT_LOADCNT 0 + ; GFX12-NEXT: $vgpr2 = SCRATCH_LOAD_SHORT_D16_HI_SADDR $sgpr0, 0, 0, $vgpr2, implicit $exec, implicit $flat_scr + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = SCRATCH_LOAD_SHORT_D16_HI_SADDR $sgpr0, 0, 0, $vgpr2, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir index d69cb448b95de..7a807260d142d 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir @@ -321,8 +321,8 @@ body: | ; GCN-NEXT: BUNDLE implicit-def $agpr0, implicit $vgpr2_vgpr3 { ; GCN-NEXT: $agpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: } - ; GCN-NEXT: S_WAITCNT 112 ; GCN-NEXT: BUNDLE implicit $agpr0, implicit $vgpr2_vgpr3 { + ; GCN-NEXT: S_WAITCNT 112 ; GCN-NEXT: FLAT_STORE_DWORD $vgpr2_vgpr3, $agpr0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: } BUNDLE implicit-def $agpr0, implicit $vgpr2_vgpr3 { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir index 4051be18dd49f..8528de77533bf 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir @@ -301,8 +301,8 @@ body: | # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { # CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: } -# CHECK-NEXT: S_WAITCNT 112 # CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 { +# CHECK-NEXT: S_WAITCNT 112 # CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: }