diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4262e7b5d9c25..eafe20be17d5b 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2390,7 +2390,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, } if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) return true; - return HasVMemLoad && UsesVgprLoadedOutside; + return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index bdef55ab956a0..0ddd2aa285b26 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -295,7 +295,7 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2 # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 +# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: @@ -342,7 +342,7 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2_store # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 +# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: @@ -499,9 +499,9 @@ body: | # GFX12-LABEL: waitcnt_vm_loop2_reginterval # GFX12-LABEL: bb.0: # GFX12: GLOBAL_LOAD_DWORDX4 -# GFX12: S_WAIT_LOADCNT 0 -# GFX12-LABEL: bb.1: # GFX12-NOT: S_WAIT_LOADCNT 0 +# GFX12-LABEL: bb.1: +# GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: name: waitcnt_vm_loop2_reginterval body: | @@ -600,7 +600,7 @@ body: | # GFX12-LABEL: bb.0: # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN # GFX12: BUFFER_LOAD_FORMAT_X_IDXEN -# GFX12: S_WAIT_LOADCNT 0 +# GFX12-NOT: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.1: # GFX12: S_WAIT_LOADCNT 0 # GFX12-LABEL: bb.2: