From 0cee5a6c95f5504d5d670ff4042e53dc53b8965c Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 13 Dec 2024 15:00:22 +0800 Subject: [PATCH] [MachineLICM] Use `RegisterClassInfo::getRegPressureSetLimit` `RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logics to adjust the limit by removing reserved registers. It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, just like the comment "This limit must be adjusted dynamically for reserved registers" said. Separate from https://github.com/llvm/llvm-project/pull/118787 --- llvm/lib/CodeGen/MachineLICM.cpp | 4 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 670 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 670 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 44 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 6212 +++++++++-------- .../buffer-fat-pointer-atomicrmw-fmax.ll | 4454 ++++++------ .../buffer-fat-pointer-atomicrmw-fmin.ll | 4454 ++++++------ .../codegen-prepare-addrspacecast-non-null.ll | 8 +- ...cannot-create-empty-or-backward-segment.ll | 95 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 2524 ++++--- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 5680 +++++++-------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 5680 +++++++-------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 2364 ++++--- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 1706 ++--- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 1672 +++-- .../flat_atomics_i64_system_noprivate.ll | 2034 +++--- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 12 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 3006 ++++---- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 4987 +++++++------ .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 4987 +++++++------ .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 2384 ++++--- .../AMDGPU/global-load-saddr-to-vaddr.ll | 14 +- .../AMDGPU/global_atomics_i32_system.ll | 706 +- .../AMDGPU/global_atomics_i64_system.ll | 1944 +++--- .../AMDGPU/global_atomics_scan_fmax.ll | 655 +- .../AMDGPU/global_atomics_scan_fmin.ll | 655 +- .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 70 +- llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 38 +- .../llvm.amdgcn.struct.atomic.buffer.load.ll | 150 +- ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll | 150 +- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 1034 +-- .../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 1380 ++-- .../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 1380 ++-- .../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 1120 ++- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 10 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 54 +- .../CodeGen/AMDGPU/optimize-negated-cond.ll | 6 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 2 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 2 +- .../AMDGPU/tuple-allocation-failure.ll | 292 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 2 +- .../AMDGPU/undefined-subreg-liverange.ll | 7 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 2 +- ...r-descriptor-waterfall-loop-idom-update.ll | 6 +- llvm/test/CodeGen/LoongArch/jr-without-ra.ll | 112 +- .../RISCV/rvv/vxrm-insert-out-of-loop.ll | 5 +- .../test/CodeGen/Thumb2/mve-blockplacement.ll | 128 +- .../CodeGen/Thumb2/mve-gather-increment.ll | 788 +-- .../Thumb2/mve-gather-scatter-optimisation.ll | 140 +- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 28 +- 50 files changed, 32480 insertions(+), 32047 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index d1d5509dc482a..798c3461094a8 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -124,6 +124,7 @@ namespace { const TargetRegisterInfo *TRI = nullptr; const MachineFrameInfo *MFI = nullptr; MachineRegisterInfo *MRI = nullptr; + RegisterClassInfo RegClassInfo; TargetSchedModel SchedModel; bool PreRegAlloc = false; bool HasProfileData = false; @@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) { MFI = &MF.getFrameInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); + RegClassInfo.runOnMachineFunction(MF); HasProfileData = MF.getFunction().hasProfileData(); @@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) { std::fill(RegPressure.begin(), RegPressure.end(), 0); RegLimit.resize(NumRPS); for (unsigned i = 0, e = NumRPS; i != e; ++i) - RegLimit[i] = TRI->getRegPressureSetLimit(MF, i); + RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i); } if (HoistConstLoads) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 23f24a9dc9982..bd2bbb9798312 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,20 +1504,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s16 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1553,19 +1554,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1579,23 +1581,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1606,23 +1609,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1660,23 +1664,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v1, s16 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, s16 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1708,22 +1713,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1733,23 +1739,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1759,23 +1766,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1802,26 +1810,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1845,27 +1854,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1897,27 +1908,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1928,27 +1940,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1976,24 +1989,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2017,25 +2032,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2067,26 +2085,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2096,26 +2115,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 11024b0a88d6b..b646b7f301092 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -325,13 +325,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -370,13 +370,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -394,13 +394,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -469,21 +469,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -513,20 +513,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -536,20 +536,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -602,15 +602,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -640,15 +640,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -686,15 +686,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -712,15 +712,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -758,21 +758,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -795,22 +795,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -840,21 +840,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -864,21 +864,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1058,21 +1058,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB9_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1104,20 +1104,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1127,20 +1127,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1190,15 +1190,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1228,15 +1228,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1274,15 +1274,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1303,15 +1303,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -1344,21 +1344,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[8:9] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1381,22 +1381,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -1427,21 +1427,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1451,24 +1451,24 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1504,20 +1504,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s16 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, s16 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1553,19 +1554,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1579,23 +1581,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -1606,23 +1609,24 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -1660,23 +1664,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s16 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: v_mov_b32_e32 v1, s16 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX940-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, s16 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1708,22 +1713,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s20 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: v_mov_b32_e32 v1, s20 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v1, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX90A-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1733,23 +1739,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: v_mov_b32_e32 v1, s20 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1759,23 +1766,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1802,26 +1810,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v10, s16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1845,27 +1854,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1897,27 +1908,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 @@ -1928,27 +1940,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 @@ -1976,24 +1989,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2017,25 +2032,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2067,26 +2085,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX908-NEXT: v_mov_b32_e32 v10, s20 +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2096,26 +2115,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s20 +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 823db84a053b8..45a9f9d47acba 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX908-NEXT: s_mul_i32 s0, s0, s7 ; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -571,17 +571,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX908-NEXT: s_mov_b32 s7, s6 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s6 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: v_mov_b32_e32 v9, s7 ; GFX908-NEXT: v_mov_b32_e32 v5, s7 ; GFX908-NEXT: v_mov_b32_e32 v7, s7 ; GFX908-NEXT: v_mov_b32_e32 v8, s6 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -601,9 +599,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s14 +; GFX908-NEXT: s_add_u32 s18, s18, s0 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s15 +; GFX908-NEXT: s_addc_u32 s19, s19, s1 ; GFX908-NEXT: s_mov_b64 s[20:21], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 @@ -622,7 +620,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -650,7 +648,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17] +; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -661,7 +659,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[0:1], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 @@ -670,7 +668,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_addc_u32 s5, s5, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[0:1], 0 +; GFX908-NEXT: s_mov_b64 s[14:15], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -720,11 +718,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 ; GFX90A-NEXT: s_mul_i32 s0, s0, s7 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -734,14 +732,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 ; GFX90A-NEXT: s_mov_b32 s7, s6 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -760,8 +756,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s14 -; GFX90A-NEXT: s_addc_u32 s19, s19, s15 +; GFX90A-NEXT: s_add_u32 s18, s18, s0 +; GFX90A-NEXT: s_addc_u32 s19, s19, s1 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[20:21], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] @@ -781,7 +777,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -802,7 +798,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17] +; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -813,7 +809,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[0:1], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 @@ -822,7 +818,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_addc_u32 s5, s5, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_mov_b64 s[14:15], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index e8f1619c5d418..48c7a9f5e2d14 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -54,23 +54,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -95,18 +95,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -122,18 +122,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -149,18 +149,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 @@ -176,19 +176,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 @@ -241,23 +241,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 @@ -290,19 +290,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,19 +316,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -342,20 +342,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -827,23 +827,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -859,12 +859,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -885,18 +885,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -912,18 +912,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -939,18 +939,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -966,19 +966,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1031,23 +1031,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 @@ -1062,11 +1062,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1087,19 +1087,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1113,19 +1113,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1139,19 +1139,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1165,20 +1165,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1218,25 +1218,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1250,23 +1249,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1 @@ -1282,12 +1281,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1308,18 +1307,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1335,18 +1334,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1362,18 +1361,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB5_1 @@ -1389,19 +1388,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB5_1 @@ -1442,25 +1441,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1474,23 +1472,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 @@ -1506,12 +1504,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1532,18 +1530,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 @@ -1559,18 +1557,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 @@ -1586,18 +1584,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 @@ -1613,19 +1611,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 @@ -1666,25 +1664,24 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1698,23 +1695,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 @@ -1730,12 +1727,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1756,18 +1753,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 @@ -1783,18 +1780,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 @@ -1810,18 +1807,18 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 @@ -1837,19 +1834,19 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 @@ -1876,25 +1873,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1919,25 +1916,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1952,26 +1949,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -1997,21 +1994,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2028,21 +2025,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2059,21 +2056,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2090,22 +2087,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,25 +2124,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1] -; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_mov_b32_e32 v8, v4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2169,25 +2165,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4 +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2200,26 +2195,26 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GFX10-NEXT: v_mov_b32_e32 v9, v5 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, v8 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 @@ -2243,22 +2238,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX908-NEXT: v_mov_b32_e32 v10, v5 -; GFX908-NEXT: v_mov_b32_e32 v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2272,22 +2267,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2301,22 +2296,22 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v10, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2330,23 +2325,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v5 -; GFX6-NEXT: v_mov_b32_e32 v9, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v5, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2903,25 +2898,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2946,25 +2941,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2979,26 +2974,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -3015,18 +3010,18 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 @@ -3043,21 +3038,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 @@ -3074,21 +3069,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 @@ -3105,21 +3100,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 @@ -3136,22 +3131,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 @@ -3174,25 +3169,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3217,25 +3212,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3250,26 +3245,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3295,21 +3290,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -3326,21 +3321,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 @@ -3357,21 +3352,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 @@ -3388,22 +3383,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 @@ -3431,43 +3426,43 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,24 +3470,25 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3502,49 +3498,51 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3552,21 +3550,22 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc @@ -3575,35 +3574,36 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3613,29 +3613,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -3647,30 +3648,31 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3682,36 +3684,37 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3721,7 +3724,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -3729,30 +3732,31 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3762,7 +3766,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3782,42 +3786,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3825,24 +3829,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX940-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3858,41 +3863,43 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3900,57 +3907,59 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 -; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -3966,29 +3975,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3999,30 +4009,31 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4033,34 +4044,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4071,35 +4083,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4768,28 +4781,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4799,23 +4811,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4823,32 +4835,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v5, s6 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4858,33 +4871,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4894,95 +4906,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4992,38 +5006,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5033,40 +5048,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5076,36 +5092,37 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5115,7 +5132,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -5123,30 +5140,31 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -5156,7 +5174,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5174,53 +5192,52 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5228,32 +5245,33 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5269,122 +5287,123 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -5400,36 +5419,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5440,38 +5460,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5482,34 +5503,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5520,35 +5542,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6293,25 +6316,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6325,23 +6347,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6366,18 +6388,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6393,20 +6415,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6429,30 +6451,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 @@ -6475,31 +6497,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 @@ -6541,25 +6563,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -6572,23 +6592,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -6621,21 +6641,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6646,41 +6666,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 @@ -6692,42 +6712,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 @@ -7268,25 +7288,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7300,23 +7319,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 @@ -7332,12 +7351,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7358,18 +7377,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -7385,20 +7404,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 @@ -7421,30 +7440,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 @@ -7467,31 +7486,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 @@ -7533,25 +7552,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7564,23 +7581,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 @@ -7595,11 +7612,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7620,19 +7637,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7646,21 +7663,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7671,41 +7688,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 @@ -7717,42 +7734,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 @@ -7793,25 +7810,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -7825,23 +7841,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -7857,12 +7873,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7883,18 +7899,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 @@ -7910,20 +7926,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 @@ -7946,30 +7962,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 @@ -7992,31 +8008,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 @@ -8058,25 +8074,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -8089,23 +8103,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -8120,11 +8134,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8145,19 +8159,19 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8171,21 +8185,21 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_add_f16_e32 v3, v2, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8196,41 +8210,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 @@ -8242,42 +8256,42 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 @@ -8312,41 +8326,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 @@ -8357,47 +8371,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8410,41 +8422,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -8455,40 +8467,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 @@ -8499,41 +8511,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 @@ -8544,42 +8556,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 @@ -8591,38 +8603,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8633,39 +8645,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8696,40 +8708,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8739,43 +8751,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -8789,39 +8803,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -8833,39 +8847,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8876,40 +8890,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8920,41 +8934,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8965,38 +8979,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9007,39 +9021,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9092,7 +9106,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9104,40 +9118,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX940-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX940-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -9150,27 +9164,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB28_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB28_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9184,42 +9198,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v6, v10, v7 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -9233,14 +9246,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -9248,15 +9261,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB28_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -9268,38 +9280,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -9311,15 +9323,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -9328,13 +9340,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9346,38 +9358,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9389,27 +9401,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9421,39 +9433,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9465,27 +9477,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9497,40 +9509,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9542,21 +9554,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9730,41 +9742,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB29_1 @@ -9775,47 +9787,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -9828,41 +9838,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1 @@ -9873,40 +9883,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 @@ -9917,41 +9927,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB29_1 @@ -9962,42 +9972,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB29_1 @@ -10009,38 +10019,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10051,39 +10061,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10114,40 +10124,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10157,43 +10167,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10207,39 +10219,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -10251,39 +10263,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10294,40 +10306,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10338,41 +10350,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10383,38 +10395,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10425,39 +10437,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10487,41 +10499,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX940-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 @@ -10532,47 +10544,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10585,41 +10595,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -10630,40 +10640,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 @@ -10674,41 +10684,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 @@ -10719,42 +10729,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 @@ -10766,38 +10776,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10808,39 +10818,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10871,40 +10881,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10914,43 +10924,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -10964,39 +10976,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -11008,39 +11020,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11051,40 +11063,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11095,41 +11107,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11140,38 +11152,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11182,39 +11194,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11245,40 +11257,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11288,43 +11300,45 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -11338,39 +11352,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -11382,39 +11396,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_add_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11425,40 +11439,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11469,41 +11483,41 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11514,38 +11528,38 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11556,39 +11570,39 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11646,23 +11660,23 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 @@ -11678,12 +11692,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -11706,18 +11720,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 @@ -11733,18 +11747,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 @@ -11760,18 +11774,18 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 @@ -11787,19 +11801,19 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c7511a2df9fe1..b00f8aecbd2f5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX940-NEXT: v_max_f32_e32 v6, v4, v9 +; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX940-NEXT: v_max_f32_e32 v8, v6, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX90A-NEXT: v_max_f32_e32 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 +; GFX908-NEXT: v_max_f32_e32 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,27 +800,28 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -832,27 +833,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -863,19 +864,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -891,25 +892,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -926,19 +927,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -955,19 +956,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -984,20 +985,20 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1028,19 +1029,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1079,19 +1080,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1107,25 +1108,25 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1142,19 +1143,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1197,30 +1198,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[6:7], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,27 +1437,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1490,27 +1491,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1521,27 +1522,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1605,17 +1606,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[2:3], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1709,17 +1710,17 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1837,15 +1838,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1903,15 +1904,15 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[11:12], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2011,30 +2012,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,30 +2300,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2346,30 +2345,30 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2401,29 +2400,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2434,29 +2433,29 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2500,47 +2499,47 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2548,256 +2547,263 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2807,7 +2813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2815,30 +2821,31 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2848,7 +2855,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2866,46 +2873,46 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2913,31 +2920,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,111 +2956,114 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3063,31 +3074,32 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3098,32 +3110,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3134,34 +3147,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3172,35 +3186,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3220,15 +3235,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3243,31 +3258,30 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 +; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3283,15 +3297,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3299,7 +3313,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3307,12 +3321,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v11, v6 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3324,24 +3338,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3355,36 +3369,36 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 +; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-NEXT: v_not_b32_e32 v11, v7 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3396,30 +3410,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX11-NEXT: v_max_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3433,14 +3446,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3449,20 +3462,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3472,26 +3485,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3503,15 +3516,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3520,19 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3544,24 +3557,24 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3573,33 +3586,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3611,25 +3624,25 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3641,33 +3654,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3679,26 +3692,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3710,21 +3723,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3885,28 +3898,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3916,23 +3928,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3940,32 +3952,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v5, s6 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3975,33 +3988,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4011,95 +4023,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4109,38 +4123,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4150,40 +4165,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4193,37 +4209,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4233,7 +4250,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4241,31 +4258,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4275,7 +4293,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4293,53 +4311,52 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4347,32 +4364,33 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4388,122 +4406,123 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4519,36 +4538,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4559,38 +4579,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4601,35 +4622,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4640,36 +4662,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5396,29 +5419,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5431,21 +5452,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5461,28 +5481,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5494,27 +5513,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5525,19 +5544,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5553,25 +5572,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5582,29 +5601,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5627,30 +5646,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5673,31 +5692,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5719,26 +5738,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_max_num_f16 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5752,25 +5770,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5781,25 +5798,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5812,25 +5829,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5842,23 +5859,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5869,24 +5886,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5897,28 +5914,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5929,41 +5946,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5975,42 +5992,42 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6031,7 +6048,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6047,26 +6064,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_pk_max_num_f16 v7, v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6082,15 +6099,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6098,14 +6115,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6117,23 +6134,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 +; GFX940-NEXT: v_pk_max_f16 v8, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6146,27 +6163,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6180,25 +6197,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6212,14 +6229,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6228,13 +6245,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6246,24 +6263,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6275,15 +6292,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6292,13 +6309,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6310,22 +6327,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 +; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX90A-NEXT: v_pk_max_f16 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6337,27 +6354,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6369,23 +6386,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 -; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX908-NEXT: v_pk_max_f16 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6397,27 +6414,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6429,27 +6446,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 +; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6461,21 +6478,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6652,45 +6669,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NEXT: v_max_num_f32_e32 v1, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6703,41 +6718,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6748,47 +6763,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_max_f32_e32 v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_dual_max_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6801,41 +6814,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6846,40 +6859,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6890,41 +6903,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6935,42 +6948,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6982,38 +6995,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7024,39 +7037,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7076,41 +7089,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f32_e32 v1, v3, v1 +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7124,40 +7139,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_max_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7167,43 +7182,45 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7217,39 +7234,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7261,39 +7278,39 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7304,40 +7321,40 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7348,41 +7365,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7394,37 +7411,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7436,38 +7453,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7487,7 +7504,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7503,42 +7520,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 -; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v6, v10, v7 +; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7554,15 +7572,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7570,14 +7588,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7589,40 +7607,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX940-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7635,27 +7653,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7669,42 +7687,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v6, v10, v7 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7718,14 +7735,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7733,15 +7750,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7753,38 +7769,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_max_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7796,15 +7812,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7813,13 +7829,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7831,38 +7847,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7874,27 +7890,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7906,39 +7922,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7950,27 +7966,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7982,40 +7998,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_max_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8027,21 +8043,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8220,19 +8236,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8271,19 +8287,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8301,25 +8317,25 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8336,19 +8352,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 0bcaacc6b08e8..0d38ee36415b8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -31,19 +31,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -82,19 +82,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -110,25 +110,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 @@ -145,19 +145,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 @@ -207,24 +207,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB1_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -257,23 +257,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -284,24 +284,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -316,20 +316,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -400,7 +400,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -412,22 +412,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX940-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX940-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX940-NEXT: v_min_f32_e32 v8, v6, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -441,21 +441,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB2_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -520,7 +520,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -532,22 +532,22 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -559,27 +559,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -591,23 +591,23 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 +; GFX908-NEXT: v_max_f32_e32 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_max_f32_e32 v6, v8, v8 +; GFX908-NEXT: v_min_f32_e32 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -619,21 +619,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -771,19 +771,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -800,27 +800,28 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -832,27 +833,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f32_e32 v1, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 @@ -863,19 +864,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -891,25 +892,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 @@ -926,19 +927,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 @@ -955,19 +956,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 @@ -984,20 +985,20 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX6-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 @@ -1028,19 +1029,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -1079,19 +1080,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1107,25 +1108,25 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 @@ -1142,19 +1143,19 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 @@ -1197,30 +1198,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1243,30 +1243,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1298,29 +1298,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 @@ -1331,29 +1331,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 @@ -1393,27 +1393,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[6:7], v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX12-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1437,27 +1437,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, v5 +; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[6:9], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v6 :: v_dual_mov_b32 v5, v7 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1490,27 +1491,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, s20 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v9, v5 +; GFX908-NEXT: v_mov_b32_e32 v8, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v3 +; GFX908-NEXT: v_mov_b32_e32 v6, v2 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: v_mov_b32_e32 v5, v7 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1521,27 +1522,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v8, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, v2 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v7 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1605,17 +1606,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[5:6], v[5:6] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[13:14], v[13:14] ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[2:3], v[0:1] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1709,17 +1710,17 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1837,15 +1838,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1903,15 +1904,15 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: v_max_f64 v[0:1], v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: v_max_f64 v[2:3], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[11:12], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -2011,30 +2012,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2057,30 +2057,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2092,31 +2092,31 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: s_add_i32 s5, s20, 0x800 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2127,26 +2127,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 @@ -2157,29 +2157,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 @@ -2190,29 +2190,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 @@ -2223,29 +2223,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_add_i32 s6, s20, 0x800 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 @@ -2256,30 +2256,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v10, s6 +; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 @@ -2300,30 +2300,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x800 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v10, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2346,30 +2345,30 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_add_i32 s5, s16, 0x800 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2401,29 +2400,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v10, s6 +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 @@ -2434,29 +2433,29 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v10, s6 +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 @@ -2500,47 +2499,47 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2548,256 +2547,263 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2807,7 +2813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2815,30 +2821,31 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -2848,7 +2855,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2866,46 +2873,46 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_min_num_f16_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2913,31 +2920,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX940-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX940-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX940-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX940-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2948,111 +2956,114 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_min_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s5, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, s7, v3 +; GFX90A-NEXT: v_max_f16_e32 v2, v0, v0 +; GFX90A-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX90A-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3063,31 +3074,32 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX908-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX908-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s8, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3098,32 +3110,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s7, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, s8, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3134,34 +3147,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3172,35 +3186,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3220,15 +3235,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 +; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-NEXT: v_not_b32_e32 v11, v7 ; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3243,31 +3258,30 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-NEXT: v_max_num_f16_e32 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v6, v6, v6 +; GFX12-NEXT: v_min_num_f16_e32 v6, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -3283,15 +3297,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3299,7 +3313,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3307,12 +3321,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX940-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 -; GFX940-NEXT: v_not_b32_e32 v10, v4 +; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v11, v6 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -3324,24 +3338,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen +; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3355,36 +3369,36 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v8 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 +; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-NEXT: v_not_b32_e32 v11, v7 ; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -3396,30 +3410,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX11-NEXT: v_min_f16_e32 v6, v6, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -3433,14 +3446,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3449,20 +3462,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX10-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v11, v7 ; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -3472,26 +3485,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_max_f16_e32 v8, v5, v5 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX10-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -3503,15 +3516,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -3520,19 +3533,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 -; GFX90A-NEXT: v_not_b32_e32 v10, v4 +; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v11, v6 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3544,24 +3557,24 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen +; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 ; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -3573,33 +3586,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v8 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 -; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX908-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX908-NEXT: v_not_b32_e32 v9, v4 +; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v11, v6 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3611,25 +3624,25 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX908-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX908-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX908-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -3641,33 +3654,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 -; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX8-NEXT: v_and_b32_e32 v10, -4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 -; GFX8-NEXT: v_not_b32_e32 v9, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v11, v6 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3679,26 +3692,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX8-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX8-NEXT: v_max_f16_e32 v8, v5, v5 +; GFX8-NEXT: v_min_f16_e32 v6, v6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -3710,21 +3723,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -3885,28 +3898,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -3916,23 +3928,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -3940,32 +3952,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v5, s6 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX940-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -3975,33 +3988,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 @@ -4011,95 +4023,97 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s5, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4109,38 +4123,39 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 +; GFX908-NEXT: v_add3_u32 v2, v2, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4150,40 +4165,41 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4193,37 +4209,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4233,7 +4250,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -4241,31 +4258,32 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 @@ -4275,7 +4293,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4293,53 +4311,52 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s4, s16, -4 +; GFX12-NEXT: s_and_b32 s5, s16, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_lshl_b32 s5, s5, 3 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-NEXT: s_lshl_b32 s6, 0xffff, s5 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_not_b32 s7, s6 +; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s6, 0 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v3 +; GFX12-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -4347,32 +4364,33 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_addk_i32 s16, 0x200 -; GFX940-NEXT: s_and_b32 s4, s16, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX940-NEXT: s_and_b32 s6, s16, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX940-NEXT: s_and_b32 s4, s16, 3 -; GFX940-NEXT: s_lshl_b32 s6, s4, 3 -; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX940-NEXT: s_not_b32 s7, s4 +; GFX940-NEXT: s_lshl_b32 s7, s4, 3 +; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX940-NEXT: s_not_b32 s8, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX940-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v0, v1, s8, v0 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4388,122 +4406,123 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_and_b32 s5, s16, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX11-NEXT: s_not_b32 s7, s6 +; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s5, v1 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_and_or_b32 v0, v1, s7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_or_b32 s6, vcc_lo, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_addk_i32 s20, 0x200 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_and_b32 s4, s20, -4 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_and_b32 s4, s20, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s5, s20, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX10-NEXT: s_not_b32 s7, s6 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen +; GFX10-NEXT: s_mov_b32 s6, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: s_or_b32 s6, vcc_lo, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_addk_i32 s20, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s20, -4 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX90A-NEXT: s_and_b32 s6, s20, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX90A-NEXT: s_and_b32 s4, s20, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 +; GFX90A-NEXT: s_lshl_b32 s7, s4, 3 +; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX90A-NEXT: s_not_b32 s8, s4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s9 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -4519,36 +4538,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_addk_i32 s20, 0x200 -; GFX908-NEXT: s_and_b32 s4, s20, -4 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX908-NEXT: s_and_b32 s6, s20, -4 +; GFX908-NEXT: v_mov_b32_e32 v1, s6 +; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX908-NEXT: s_and_b32 s4, s20, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 +; GFX908-NEXT: s_lshl_b32 s7, s4, 3 +; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX908-NEXT: s_not_b32 s8, s4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8 +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v0, s9 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX908-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s8, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4559,38 +4579,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_addk_i32 s20, 0x200 -; GFX8-NEXT: s_and_b32 s4, s20, -4 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX8-NEXT: s_and_b32 s6, s20, -4 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX8-NEXT: s_and_b32 s4, s20, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 +; GFX8-NEXT: s_lshl_b32 s7, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_not_b32 s8, s4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4601,35 +4622,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_addk_i32 s20, 0x200 -; GFX7-NEXT: s_and_b32 s4, s20, -4 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX7-NEXT: s_and_b32 s6, s20, -4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_lshl_b32 s7, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 +; GFX7-NEXT: s_not_b32 s8, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4640,36 +4662,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_addk_i32 s20, 0x200 -; GFX6-NEXT: s_and_b32 s4, s20, -4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen +; GFX6-NEXT: s_and_b32 s6, s20, -4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_lshl_b32 s7, s4, 3 +; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s7 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 +; GFX6-NEXT: s_not_b32 s8, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s7, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5396,29 +5419,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 -; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v1, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5431,21 +5452,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5461,28 +5481,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5494,27 +5513,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_max_f16 v1, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 @@ -5525,19 +5544,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -5553,25 +5572,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 -; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB16_1 @@ -5582,29 +5601,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 @@ -5627,30 +5646,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 @@ -5673,31 +5692,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v5 +; GFX6-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v0 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 @@ -5719,26 +5738,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: s_add_co_i32 s5, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v1, v0, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_min_num_f16 v1, v3, v1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5752,25 +5770,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5781,25 +5798,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: s_add_i32 s5, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -5812,25 +5829,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_i32 s5, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 @@ -5842,23 +5859,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5869,24 +5886,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v1, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v1, v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5897,28 +5914,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5929,41 +5946,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 @@ -5975,42 +5992,42 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v7, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 @@ -6031,7 +6048,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6047,26 +6064,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v5, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 +; GFX12-NEXT: v_pk_max_num_f16 v6, v8, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_pk_min_num_f16 v7, v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6082,15 +6099,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6098,14 +6115,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6117,23 +6134,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX940-NEXT: v_pk_max_f16 v6, v9, v9 ; GFX940-NEXT: s_mov_b64 s[8:9], exec -; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 +; GFX940-NEXT: v_pk_min_f16 v8, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -6146,27 +6163,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB18_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6180,25 +6197,25 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX11-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -6212,14 +6229,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6228,13 +6245,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 @@ -6246,24 +6263,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX10-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX10-NEXT: v_pk_max_f16 v6, v8, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -6275,15 +6292,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -6292,13 +6309,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6310,22 +6327,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 -; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 +; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9 +; GFX90A-NEXT: v_pk_min_f16 v8, v6, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -6337,27 +6354,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6369,23 +6386,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX908-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 -; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_pk_max_f16 v6, v8, v8 +; GFX908-NEXT: v_pk_min_f16 v7, v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -6397,27 +6414,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6429,27 +6446,27 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 ; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX8-NEXT: v_max_f16_sdwa v4, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_max_f16_sdwa v6, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v5, v5 +; GFX8-NEXT: v_max_f16_e32 v7, v8, v8 +; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -6461,21 +6478,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -6652,45 +6669,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s16 +; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NEXT: v_min_num_f32_e32 v1, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-NEXT: v_dual_min_num_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX12-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6703,41 +6718,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v1, v4, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX940-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB19_1 @@ -6748,47 +6763,45 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_and_b32 v1, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_min_f32_e32 v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_dual_min_f32 v0, v3, v0 :: v_dual_cndmask_b32 v1, v5, v7 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6801,41 +6814,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_min_f32_e32 v1, v5, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v3, v1, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -6846,40 +6859,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v1, v4, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v4, v1, v0, s10 +; GFX90A-NEXT: v_mov_b32_e32 v3, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 @@ -6890,41 +6903,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v0, v5 -; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v3, v3, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v1, v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB19_1 @@ -6935,42 +6948,42 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, v5 -; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 @@ -6982,38 +6995,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7024,39 +7037,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7076,41 +7089,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-NEXT: v_mov_b32_e32 v1, s16 +; GFX12-NEXT: s_add_co_i32 s6, s16, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f32_e32 v1, v3, v1 +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_mov_b32_e32 v5, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7124,40 +7139,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s16 -; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s16, 0x400 +; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s8, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX940-NEXT: s_mov_b32 s9, 0x7060302 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_movk_i32 s9, 0x7fff +; GFX940-NEXT: s_mov_b32 s10, 0x7060302 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX940-NEXT: v_min_f32_e32 v2, v5, v4 +; GFX940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, s8 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX940-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7167,43 +7182,45 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_add_i32 s6, s16, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -7217,39 +7234,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: s_add_i32 s6, s20, 0x400 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v1, v1 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s6 +; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 @@ -7261,39 +7278,39 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s8, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: s_movk_i32 s9, 0x7fff +; GFX90A-NEXT: s_mov_b32 s10, 0x7060302 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX90A-NEXT: v_min_f32_e32 v2, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s9 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v2, v1, s10 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7304,40 +7321,40 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 -; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 +; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s8, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: s_movk_i32 s9, 0x7fff +; GFX908-NEXT: s_mov_b32 s10, 0x7060302 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 -; GFX908-NEXT: v_mov_b32_e32 v6, v1 -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX908-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v4, v4, v1, s9 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s9 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX908-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v1, v3, v1, s10 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7348,41 +7365,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 -; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 +; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s8, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v6, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7394,37 +7411,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7436,38 +7453,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s6, s20, 0x400 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v6, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7487,7 +7504,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7503,42 +7520,43 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX12-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 -; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v6, v10, v7 +; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -7554,15 +7572,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7570,14 +7588,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7589,40 +7607,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 +; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 ; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 ; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s10 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX940-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX940-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX940-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX940-NEXT: v_perm_b32 v8, v6, v4, s11 +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 @@ -7635,27 +7653,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_4 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7669,42 +7687,41 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v6, v10, v7 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7718,14 +7735,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -7733,15 +7750,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7753,38 +7769,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_min_f32_e32 v6, v10, v7 +; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc_lo +; GFX10-NEXT: v_perm_b32 v7, v6, v4, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v8 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -7796,15 +7812,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -7813,13 +7829,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 +; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7831,38 +7847,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v8, v6, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -7874,27 +7890,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 +; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7906,39 +7922,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s14 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX908-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX908-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX908-NEXT: v_perm_b32 v7, v6, v4, s15 +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7950,27 +7966,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7982,40 +7998,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 +; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX8-NEXT: v_min_f32_e32 v6, v7, v6 +; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v7, v6, v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -8027,21 +8043,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -8220,19 +8236,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v0, s16 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_add_i32 s6, s16, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 @@ -8271,19 +8287,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc @@ -8301,25 +8317,25 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v4, v0 +; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v5, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 @@ -8336,19 +8352,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 -; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v5, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 3216e71e6221a..28a2245406842 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -245,14 +245,14 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] -; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 -; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: .LBB7_3: ; %finally ; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5] -; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[4:5] +; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) ; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 9104dc68eb9b4..a8ddece611783 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -6,95 +6,90 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 +; CHECK-NEXT: s_load_dword s4, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 +; CHECK-NEXT: s_load_dword s12, s[8:9], 0x4 ; CHECK-NEXT: s_add_u32 s24, s24, s15 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s2, 0 -; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s2, 8 -; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s2, 16 +; CHECK-NEXT: s_bitcmp1_b32 s4, 0 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s0, 24 +; CHECK-NEXT: s_bitcmp1_b32 s4, 8 +; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s4, 16 ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 +; CHECK-NEXT: s_bitcmp1_b32 s0, 24 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_xor_b64 s[14:15], s[4:5], -1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s12, 8 ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s6, 8 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] -; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 -; CHECK-NEXT: s_and_b64 s[4:5], exec, s[4:5] -; CHECK-NEXT: s_and_b64 s[6:7], exec, s[10:11] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_and_b64 s[0:1], exec, s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[14:15], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: .LBB0_2: ; %Flow7 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[2:3] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_mov_b64 vcc, s[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], s[10:11] +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[20:21], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_6: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], -1 -; CHECK-NEXT: s_mov_b64 s[18:19], 0 -; CHECK-NEXT: s_mov_b64 s[22:23], 0 +; CHECK-NEXT: s_mov_b64 s[14:15], -1 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], 0 ; CHECK-NEXT: .LBB0_7: ; %bb10 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], -1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[22:23], s[14:15] +; CHECK-NEXT: s_mov_b64 s[16:17], -1 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_mov_b64 s[20:21], s[12:13] ; CHECK-NEXT: .LBB0_8: ; %Flow9 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[22:23] -; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: ; %bb.9: ; %bb13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[16:17], 0 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 -; CHECK-NEXT: s_mov_b64 vcc, s[6:7] +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: s_cbranch_vccz .LBB0_11 ; CHECK-NEXT: ; %bb.10: ; %bb16 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_mov_b64 s[22:23], s[12:13] +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[20:21], s[10:11] ; CHECK-NEXT: .LBB0_11: ; %Flow11 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[18:19], 0 +; CHECK-NEXT: s_mov_b64 s[16:17], 0 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 -; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_xor_b64 s[20:21], s[18:19], -1 +; CHECK-NEXT: s_mov_b64 s[18:19], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[20:21] ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -102,22 +97,22 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_mov_b64 s[20:21], 0 +; CHECK-NEXT: s_mov_b64 s[18:19], 0 ; CHECK-NEXT: .LBB0_16: ; %Flow13 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[18:19] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 ; CHECK-NEXT: ; %bb.17: ; %loop.exit.guard -; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] +; CHECK-NEXT: s_and_b64 vcc, exec, s[14:15] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_22 ; CHECK-NEXT: ; %bb.18: ; %loop.exit.guard5 -; CHECK-NEXT: s_and_b64 vcc, exec, s[18:19] +; CHECK-NEXT: s_and_b64 vcc, exec, s[16:17] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_23 ; CHECK-NEXT: ; %bb.19: ; %bb17 -; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccz .LBB0_21 ; CHECK-NEXT: ; %bb.20: ; %bb19 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; CHECK-NEXT: s_cbranch_vccz .LBB0_22 ; CHECK-NEXT: .LBB0_21: ; %bb18 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ff48a3fc98018..4281a4b74cbae 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -16515,41 +16515,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16564,34 +16562,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB68_1 @@ -16605,33 +16603,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB68_1 @@ -16645,33 +16643,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB68_1 @@ -16685,34 +16683,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB68_1 @@ -16790,41 +16788,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16840,35 +16836,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB69_1 @@ -16881,33 +16877,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 @@ -16921,33 +16917,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB69_1 @@ -16963,34 +16959,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB69_1 @@ -17078,40 +17074,38 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_dual_add_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17126,35 +17120,35 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB70_1 @@ -17171,33 +17165,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 @@ -17214,33 +17208,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB70_1 @@ -17255,34 +17249,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB70_1 @@ -17359,41 +17353,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17406,35 +17400,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB71_1 @@ -17445,36 +17439,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB71_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17484,36 +17478,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB71_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17523,37 +17517,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB71_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17626,41 +17620,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17675,35 +17669,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB72_1 @@ -17714,36 +17708,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17753,36 +17747,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB72_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17794,37 +17788,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB72_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17907,41 +17901,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17956,35 +17950,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB73_1 @@ -18001,28 +17995,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -18044,28 +18038,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -18083,37 +18077,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB73_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18191,41 +18185,39 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18241,35 +18233,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB74_1 @@ -18282,35 +18274,35 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB74_1 @@ -18324,33 +18316,33 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB74_1 @@ -18366,34 +18358,34 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB74_1 @@ -18471,41 +18463,41 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18520,35 +18512,35 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB75_1 @@ -18559,38 +18551,38 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB75_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18600,36 +18592,36 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB75_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18641,37 +18633,37 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB75_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18748,41 +18740,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18797,34 +18787,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB76_1 @@ -18838,33 +18828,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB76_1 @@ -18878,33 +18868,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB76_1 @@ -18918,34 +18908,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB76_1 @@ -19022,41 +19012,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19069,35 +19059,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB77_1 @@ -19108,36 +19098,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB77_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19147,36 +19137,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB77_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19186,37 +19176,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB77_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19290,41 +19280,39 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19339,34 +19327,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -19380,33 +19368,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -19420,33 +19408,33 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -19460,34 +19448,34 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -19564,41 +19552,41 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19611,35 +19599,35 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -19650,36 +19638,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19689,36 +19677,36 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19728,37 +19716,37 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 36aa73fbf8e92..36d7201c6fe46 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,15 +1259,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1287,14 +1286,14 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1314,13 +1313,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1338,13 +1337,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1427,13 +1426,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1472,13 +1471,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1496,13 +1495,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1573,13 +1572,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1618,13 +1617,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1642,13 +1641,13 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1715,13 +1714,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1762,13 +1761,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1786,13 +1785,13 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1871,18 +1870,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1923,17 +1922,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1950,17 +1949,17 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2024,21 +2023,21 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2070,20 +2069,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,20 +2092,20 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2164,21 +2163,21 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2212,20 +2211,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2235,20 +2234,20 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2315,23 +2314,23 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2373,12 +2372,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2400,12 +2399,12 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2477,13 +2476,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2524,13 +2523,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2550,13 +2549,13 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2626,21 +2625,21 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2674,22 +2673,22 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2699,20 +2698,20 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2770,29 +2769,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,24 +2800,25 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,22 +2895,23 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -3000,90 +3001,91 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3091,17 +3093,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3159,7 +3162,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3186,9 +3188,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3207,6 +3210,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3262,7 +3266,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3285,9 +3288,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3306,6 +3310,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3403,7 +3408,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3426,8 +3430,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3443,6 +3448,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3458,7 +3464,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3486,8 +3491,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3503,6 +3509,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3571,7 +3578,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3598,9 +3604,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3619,6 +3626,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3675,7 +3683,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3698,9 +3705,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3719,6 +3727,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3816,7 +3825,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3839,8 +3847,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3856,6 +3865,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3871,7 +3881,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3899,8 +3908,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3916,6 +3926,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3984,7 +3995,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,20 +4011,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4023,18 +4034,19 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4080,7 +4092,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4093,21 +4104,22 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4115,18 +4127,19 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4212,7 +4225,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4225,38 +4237,40 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4264,7 +4278,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,42 +4292,44 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4370,14 +4385,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4390,20 +4404,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4411,19 +4426,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4471,13 +4487,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4487,40 +4502,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4610,11 +4627,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4625,38 +4641,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4664,13 +4682,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4681,42 +4698,44 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4775,14 +4794,13 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4795,20 +4813,21 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4816,19 +4835,20 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4877,13 +4897,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4893,40 +4912,42 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5016,11 +5037,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5031,38 +5051,40 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5070,13 +5092,12 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5087,42 +5108,44 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5181,29 +5204,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5212,24 +5235,25 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5276,29 +5300,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5306,76 +5330,78 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: @@ -5383,138 +5409,140 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5522,17 +5550,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5541,37 +5570,37 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5579,17 +5608,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5604,29 +5634,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5635,24 +5665,25 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5699,29 +5730,29 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5729,22 +5760,23 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5834,90 +5866,91 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5925,17 +5958,18 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5998,9 +6032,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6012,11 +6045,12 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6049,14 +6083,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6076,9 +6110,8 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6090,11 +6123,12 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6118,7 +6152,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6130,9 +6163,10 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6161,14 +6195,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6195,14 +6229,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6229,17 +6263,17 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6302,11 +6336,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6316,11 +6349,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6355,14 +6389,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6383,11 +6417,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6397,11 +6430,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6426,10 +6460,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6438,9 +6471,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6470,14 +6504,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6505,14 +6539,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6540,17 +6574,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6615,11 +6649,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6629,11 +6662,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6669,14 +6703,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6697,11 +6731,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6711,11 +6744,12 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6740,10 +6774,9 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6752,9 +6785,10 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6784,14 +6818,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6819,14 +6853,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6854,17 +6888,17 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6928,9 +6962,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6942,9 +6975,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6978,13 +7012,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7004,9 +7038,8 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7018,9 +7051,10 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7045,7 +7079,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7057,8 +7090,9 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7087,13 +7121,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7120,13 +7154,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7153,16 +7187,16 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7221,36 +7255,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7268,29 +7302,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7300,37 +7334,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7342,31 +7376,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7377,31 +7411,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,31 +7445,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,32 +7479,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7525,36 +7559,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7573,29 +7607,29 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7605,37 +7639,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7647,31 +7681,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7682,31 +7716,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7716,31 +7750,31 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,32 +7784,32 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7831,15 +7865,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7864,14 +7898,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX940-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7890,15 +7924,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7922,23 +7956,23 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v5, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7951,14 +7985,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7977,14 +8011,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8005,19 +8039,19 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8067,24 +8101,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8098,23 +8132,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8124,25 +8158,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8156,23 +8190,23 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8183,22 +8217,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8208,22 +8242,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8235,22 +8269,22 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8300,11 +8334,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8314,11 +8347,12 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8354,14 +8388,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8382,11 +8416,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8396,11 +8429,12 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8425,10 +8459,9 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8437,9 +8470,10 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8469,14 +8503,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8506,14 +8540,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8541,17 +8575,17 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8614,37 +8648,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8662,29 +8696,29 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8694,37 +8728,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8736,31 +8770,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8771,33 +8805,33 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8807,31 +8841,31 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,32 +8875,32 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12274,15 +12308,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12304,14 +12338,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12329,15 +12363,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12357,14 +12391,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12384,13 +12418,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12408,13 +12442,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12466,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12510,15 +12544,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12540,14 +12574,14 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12565,15 +12599,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12594,21 +12628,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -12621,13 +12655,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12645,13 +12679,13 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12671,21 +12705,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12749,15 +12783,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12787,19 +12821,19 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX940-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12818,20 +12852,20 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12845,21 +12879,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12876,17 +12910,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12903,17 +12937,17 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_max_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12928,21 +12962,21 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13005,21 +13039,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13033,22 +13067,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13058,22 +13092,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13085,21 +13119,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13110,20 +13144,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13133,20 +13167,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13156,24 +13190,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13232,21 +13266,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13260,22 +13294,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13285,22 +13319,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13314,21 +13348,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13339,20 +13373,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13362,20 +13396,20 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13387,24 +13421,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13466,21 +13500,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13498,24 +13532,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13529,22 +13563,22 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13558,21 +13592,21 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13589,12 +13623,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13616,12 +13650,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13639,24 +13673,24 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13719,15 +13753,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13750,14 +13784,14 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13775,15 +13809,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13804,21 +13838,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13831,13 +13865,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13857,13 +13891,13 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13883,21 +13917,21 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13960,22 +13994,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13989,22 +14023,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14014,22 +14048,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14043,21 +14077,21 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14068,22 +14102,22 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14093,20 +14127,20 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14118,24 +14152,24 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14202,38 +14236,36 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14250,35 +14282,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14291,41 +14323,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14340,34 +14370,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14381,33 +14411,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14421,33 +14451,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14461,34 +14491,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14548,38 +14578,36 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14596,35 +14624,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14637,41 +14665,39 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14687,35 +14713,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14728,33 +14754,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14768,33 +14794,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14810,34 +14836,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14897,38 +14923,36 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14953,35 +14977,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX940-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX940-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15000,40 +15024,38 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_dual_max_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15048,35 +15070,35 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15093,33 +15115,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15136,33 +15158,33 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15177,34 +15199,34 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15263,38 +15285,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15308,38 +15330,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15349,41 +15371,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15396,35 +15418,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15435,36 +15457,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15474,36 +15496,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15513,38 +15535,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] @@ -15598,38 +15620,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15643,38 +15665,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15684,41 +15706,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15733,35 +15755,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15772,36 +15794,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15811,36 +15833,36 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15852,37 +15874,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15940,38 +15962,38 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15989,40 +16011,40 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16036,41 +16058,41 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16085,35 +16107,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16130,28 +16152,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16173,28 +16195,28 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16212,37 +16234,37 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16301,39 +16323,37 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16350,35 +16370,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16391,41 +16411,39 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16441,35 +16459,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16482,35 +16500,35 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16524,33 +16542,33 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16566,34 +16584,34 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16652,39 +16670,39 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16698,38 +16716,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16739,41 +16757,41 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16788,35 +16806,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16827,38 +16845,38 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16868,36 +16886,36 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16909,37 +16927,37 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index d96d3db9f005d..daa3df680e5ca 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -31,13 +31,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -76,13 +76,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -100,13 +100,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -173,13 +173,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -220,13 +220,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -244,13 +244,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -329,18 +329,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB2_1 @@ -381,17 +381,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 @@ -408,17 +408,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 @@ -482,21 +482,21 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -528,20 +528,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -551,20 +551,20 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -622,21 +622,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -670,20 +670,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -693,20 +693,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -773,23 +773,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -831,12 +831,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -858,12 +858,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -935,13 +935,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -982,13 +982,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1008,13 +1008,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1084,21 +1084,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1132,22 +1132,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1157,20 +1157,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1235,13 +1235,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1259,15 +1259,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1287,14 +1286,14 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1314,13 +1313,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1338,13 +1337,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1427,13 +1426,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1472,13 +1471,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1496,13 +1495,13 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1573,13 +1572,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1618,13 +1617,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1642,13 +1641,13 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1715,13 +1714,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1762,13 +1761,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1786,13 +1785,13 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1871,18 +1870,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 @@ -1923,17 +1922,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 @@ -1950,17 +1949,17 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 @@ -2024,21 +2023,21 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2070,20 +2069,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2093,20 +2092,20 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2164,21 +2163,21 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2212,20 +2211,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2235,20 +2234,20 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2315,23 +2314,23 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2373,12 +2372,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2400,12 +2399,12 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX908-NEXT: v_max_f32_e32 v5, v1, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2477,13 +2476,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2524,13 +2523,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2550,13 +2549,13 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2626,21 +2625,21 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2674,22 +2673,22 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2699,20 +2698,20 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2770,29 +2769,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -2801,24 +2800,25 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB18_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB18_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2865,29 +2865,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -2895,22 +2895,23 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB18_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB18_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -3000,90 +3001,91 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB18_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB18_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB18_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB18_6 @@ -3091,17 +3093,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -3159,7 +3162,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3186,9 +3188,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3207,6 +3210,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB19_2 ; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3262,7 +3266,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3285,9 +3288,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3306,6 +3310,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB19_2 ; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3403,7 +3408,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3426,8 +3430,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3443,6 +3448,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB19_2 ; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3458,7 +3464,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 @@ -3486,8 +3491,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3503,6 +3509,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB19_2 ; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3571,7 +3578,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3598,9 +3604,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[6:7], v[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3619,6 +3626,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execz .LBB20_2 ; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3675,7 +3683,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base @@ -3698,9 +3705,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3719,6 +3727,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3816,7 +3825,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base @@ -3839,8 +3847,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3856,6 +3865,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_cbranch_execz .LBB20_2 ; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 @@ -3871,7 +3881,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -3899,8 +3908,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3916,6 +3926,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_cbranch_execz .LBB20_2 ; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 ; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen @@ -3984,7 +3995,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4001,20 +4011,21 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4023,18 +4034,19 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB21_2 ; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4080,7 +4092,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 @@ -4093,21 +4104,22 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -4115,18 +4127,19 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB21_2 ; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4212,7 +4225,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4225,38 +4237,40 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB21_2 ; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4264,7 +4278,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4279,42 +4292,44 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[2:3] -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4370,14 +4385,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB22_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4390,20 +4404,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4411,19 +4426,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB22_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB22_2 ; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4471,13 +4487,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4487,40 +4502,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB22_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4610,11 +4627,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_3 @@ -4625,38 +4641,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB22_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB22_2 ; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -4664,13 +4682,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_3 @@ -4681,42 +4698,44 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB22_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4775,14 +4794,13 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB23_3 ; GFX12-NEXT: ; %bb.1: ; %Flow2 @@ -4795,20 +4813,21 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4816,19 +4835,20 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_cbranch_execnz .LBB23_4 ; GFX12-NEXT: ; %bb.5: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB23_2 ; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -4877,13 +4897,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB23_3 ; GFX11-NEXT: ; %bb.1: ; %Flow2 @@ -4893,40 +4912,42 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_cbranch_execnz .LBB23_4 ; GFX11-NEXT: ; %bb.5: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB23_2 ; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -5016,11 +5037,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_3 @@ -5031,38 +5051,40 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB23_4 ; GFX908-NEXT: ; %bb.5: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB23_2 ; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5070,13 +5092,12 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_3 @@ -5087,42 +5108,44 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB23_4 ; GFX8-NEXT: ; %bb.5: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB23_2 ; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private -; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 -; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5181,29 +5204,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5212,24 +5235,25 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB24_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB24_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5276,29 +5300,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5306,76 +5330,78 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB24_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB24_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_4 ; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_2 ; GFX10-NEXT: ; %bb.3: ; %Flow ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: .LBB24_4: ; %Flow2 ; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 ; GFX10-NEXT: s_cbranch_execz .LBB24_6 ; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: @@ -5383,138 +5409,140 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB24_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB24_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB24_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB24_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB24_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB24_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB24_6 @@ -5522,17 +5550,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5541,37 +5570,37 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_4 ; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[2:3] -; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] +; GFX7-NEXT: flat_load_dword v4, v[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB24_2 ; GFX7-NEXT: ; %bb.3: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7-NEXT: .LBB24_4: ; %Flow2 ; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_cbranch_execz .LBB24_6 @@ -5579,17 +5608,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 -; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX7-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 @@ -5604,29 +5634,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_4 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] -; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5635,24 +5665,25 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: ; %bb.3: ; %Flow ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX12-NEXT: .LBB25_4: ; %Flow2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX12-NEXT: s_cbranch_execz .LBB25_6 ; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -5699,29 +5730,29 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_4 ; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5729,22 +5760,23 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX11-NEXT: ; %bb.3: ; %Flow ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX11-NEXT: .LBB25_4: ; %Flow2 ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-NEXT: s_cbranch_execz .LBB25_6 ; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo -; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -5834,90 +5866,91 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_4 ; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v3 -; GFX908-NEXT: v_mov_b32_e32 v8, v2 -; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB25_2 ; GFX908-NEXT: ; %bb.3: ; %Flow ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: .LBB25_4: ; %Flow2 ; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_cbranch_execz .LBB25_6 ; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 -; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v2 -; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_4 ; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB25_2 ; GFX8-NEXT: ; %bb.3: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: .LBB25_4: ; %Flow2 ; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_cbranch_execz .LBB25_6 @@ -5925,17 +5958,18 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 -; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5998,9 +6032,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] @@ -6012,11 +6045,12 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6049,14 +6083,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6076,9 +6110,8 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] @@ -6090,11 +6123,12 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6118,7 +6152,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -6130,9 +6163,10 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6161,14 +6195,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6195,14 +6229,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6229,17 +6263,17 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6302,11 +6336,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6316,11 +6349,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6355,14 +6389,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6383,11 +6417,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6397,11 +6430,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6426,10 +6460,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6438,9 +6471,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6470,14 +6504,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6505,14 +6539,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6540,17 +6574,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6615,11 +6649,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6629,11 +6662,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6669,14 +6703,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -6697,11 +6731,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6711,11 +6744,12 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -6740,10 +6774,9 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -6752,9 +6785,10 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6784,14 +6818,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc @@ -6819,14 +6853,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -6854,17 +6888,17 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6928,9 +6962,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: flat_load_b32 v4, v[0:1] @@ -6942,9 +6975,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -6978,13 +7012,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -7004,9 +7038,8 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] @@ -7018,9 +7051,10 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -7045,7 +7079,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -7057,8 +7090,9 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7087,13 +7121,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc @@ -7120,13 +7154,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -7153,16 +7187,16 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7221,36 +7255,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7268,29 +7302,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7300,37 +7334,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7342,31 +7376,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -7377,31 +7411,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7411,31 +7445,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7445,32 +7479,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7525,36 +7559,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7573,29 +7607,29 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7605,37 +7639,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7647,31 +7681,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -7682,31 +7716,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7716,31 +7750,31 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7750,32 +7784,32 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7831,15 +7865,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7864,14 +7898,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 @@ -7890,15 +7924,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -7922,23 +7956,23 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX10-NEXT: v_min_f16_e32 v0, v5, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 @@ -7951,14 +7985,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7977,14 +8011,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8005,19 +8039,19 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -8067,24 +8101,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8098,23 +8132,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8124,25 +8158,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8156,23 +8190,23 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 @@ -8183,22 +8217,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8208,22 +8242,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8235,22 +8269,22 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8300,11 +8334,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8314,11 +8347,12 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8354,14 +8388,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -8382,11 +8416,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8396,11 +8429,12 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -8425,10 +8459,9 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v5, v[0:1] ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -8437,9 +8470,10 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8469,14 +8503,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -8506,14 +8540,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc @@ -8541,17 +8575,17 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8614,37 +8648,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -8662,29 +8696,29 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8694,37 +8728,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8736,31 +8770,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -8771,33 +8805,33 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: flat_load_dword v3, v[0:1] -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8807,31 +8841,31 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: flat_load_dword v3, v[0:1] -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8841,32 +8875,32 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12274,15 +12308,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12304,14 +12338,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12329,15 +12363,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12357,14 +12391,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12384,13 +12418,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12408,13 +12442,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12432,21 +12466,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -12510,15 +12544,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12540,14 +12574,14 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12565,15 +12599,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12594,21 +12628,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB47_1 @@ -12621,13 +12655,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12645,13 +12679,13 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12671,21 +12705,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12749,15 +12783,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12787,19 +12821,19 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX940-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB48_1 @@ -12818,20 +12852,20 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12845,21 +12879,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB48_1 @@ -12876,17 +12910,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 @@ -12903,17 +12937,17 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v0, v5 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB48_1 @@ -12928,21 +12962,21 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -13005,21 +13039,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13033,22 +13067,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13058,22 +13092,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13085,21 +13119,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -13110,20 +13144,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13133,20 +13167,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13156,24 +13190,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13232,21 +13266,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13260,22 +13294,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13285,22 +13319,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13314,21 +13348,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13339,20 +13373,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13362,20 +13396,20 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13387,24 +13421,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13466,21 +13500,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13498,24 +13532,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13529,22 +13563,22 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13558,21 +13592,21 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13589,12 +13623,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v3, v0 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -13616,12 +13650,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v0, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 -; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX908-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX908-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13639,24 +13673,24 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13719,15 +13753,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13750,14 +13784,14 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13775,15 +13809,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13804,21 +13838,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v1, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -13831,13 +13865,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13857,13 +13891,13 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13883,21 +13917,21 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13960,22 +13994,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13989,22 +14023,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14014,22 +14048,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14043,21 +14077,21 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14068,22 +14102,22 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14093,20 +14127,20 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14118,24 +14152,24 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14202,38 +14236,36 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14250,35 +14282,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14291,41 +14323,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14340,34 +14370,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14381,33 +14411,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14421,33 +14451,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14461,34 +14491,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14548,38 +14578,36 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14596,35 +14624,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14637,41 +14665,39 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14687,35 +14713,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14728,33 +14754,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14768,33 +14794,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14810,34 +14836,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14897,38 +14923,36 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14953,35 +14977,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX940-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX940-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15000,40 +15024,38 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_dual_min_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15048,35 +15070,35 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -15093,33 +15115,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15136,33 +15158,33 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15177,34 +15199,34 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15263,38 +15285,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15308,38 +15330,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15349,41 +15371,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15396,35 +15418,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15435,36 +15457,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15474,36 +15496,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15513,38 +15535,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] @@ -15598,38 +15620,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15643,38 +15665,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15684,41 +15706,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15733,35 +15755,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15772,36 +15794,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15811,36 +15833,36 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15852,37 +15874,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15940,38 +15962,38 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15989,40 +16011,40 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16036,41 +16058,41 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16085,35 +16107,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16130,28 +16152,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -16173,28 +16195,28 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_min_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -16212,37 +16234,37 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16301,39 +16323,37 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16350,35 +16370,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16391,41 +16411,39 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16441,35 +16459,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16482,35 +16500,35 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16524,33 +16542,33 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16566,34 +16584,34 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16652,39 +16670,39 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16698,38 +16716,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16739,41 +16757,41 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16788,35 +16806,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16827,38 +16845,38 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16868,36 +16886,36 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16909,37 +16927,37 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 14f75814128f1..1b56fd020d502 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -13729,38 +13729,36 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13777,35 +13775,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -13818,41 +13816,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -13867,34 +13863,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -13908,33 +13904,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -13948,33 +13944,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -13988,34 +13984,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14075,38 +14071,36 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14123,35 +14117,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14164,41 +14158,39 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14214,35 +14206,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -14255,33 +14247,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -14295,33 +14287,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -14337,34 +14329,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -14424,38 +14416,36 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14480,35 +14470,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX940-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX940-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -14527,40 +14517,38 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 ; GFX11-NEXT: flat_load_b32 v0, v[4:5] ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_lshlrev_b32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_dual_sub_f32 v0, v6, v0 :: v_dual_and_b32 v7, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14575,35 +14563,35 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -14620,33 +14608,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX90A-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -14663,33 +14651,33 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v0, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 -; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -14704,34 +14692,34 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -14790,38 +14778,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14835,38 +14823,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: flat_load_dword v5, v[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14876,41 +14864,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14923,35 +14911,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dword v3, v[0:1] -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -14962,36 +14950,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15001,36 +14989,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: flat_load_dword v4, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15040,37 +15028,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15125,38 +15113,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15170,38 +15158,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15211,41 +15199,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15260,35 +15248,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -15299,36 +15287,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15338,36 +15326,36 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX908-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15379,37 +15367,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15467,38 +15455,38 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15516,40 +15504,40 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s0, 0xf800 ; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: flat_load_dword v5, v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15563,41 +15551,41 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[3:4] ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15612,35 +15600,35 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -15657,28 +15645,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v3, v0 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s9 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -15700,28 +15688,28 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: flat_load_dword v1, v[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX908-NEXT: v_sub_f32_e32 v0, v0, v5 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX908-NEXT: v_sub_f32_e32 v0, v5, v0 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -15739,37 +15727,37 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15828,39 +15816,37 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15877,35 +15863,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -15918,41 +15904,39 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15968,35 +15952,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: flat_load_dword v0, v[3:4] ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX10-NEXT: v_sub_f32_e32 v0, v6, v0 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -16009,35 +15993,35 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -16051,33 +16035,33 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -16093,34 +16077,34 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -16179,39 +16163,39 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16225,38 +16209,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16266,41 +16250,41 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16315,35 +16299,35 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: flat_load_dword v4, v[0:1] ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -16354,38 +16338,38 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX90A-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16395,36 +16379,36 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX908-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16436,37 +16420,37 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 1311560715ddd..61549fa88f424 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -2057,19 +2057,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: v_not_b32_e32 v2, v2 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2081,19 +2083,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: v_not_b32_e32 v2, v2 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,19 +2109,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: v_not_b32_e32 v2, v2 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2135,23 +2141,25 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN1-NEXT: v_not_b32_e32 v2, v2 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2161,23 +2169,25 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN2-NEXT: v_not_b32_e32 v2, v2 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar: @@ -2185,19 +2195,21 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 -; GCN3-NEXT: v_not_b32_e32 v2, v2 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2215,19 +2227,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN1-NEXT: v_not_b32_e32 v3, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_1 @@ -2241,19 +2253,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN2-NEXT: v_not_b32_e32 v3, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_1 @@ -2267,19 +2279,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN3-NEXT: v_not_b32_e32 v3, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_1 @@ -2296,25 +2308,27 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN1-NEXT: v_not_b32_e32 v3, v0 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2322,25 +2336,27 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN2-NEXT: v_not_b32_e32 v3, v0 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar: @@ -2349,19 +2365,19 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN3-NEXT: v_not_b32_e32 v3, v0 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_not_b32_e32 v0, v0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB57_1 @@ -3516,18 +3532,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3539,18 +3557,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3562,18 +3582,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3591,22 +3613,24 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3616,22 +3640,24 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar: @@ -3639,18 +3665,20 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3668,18 +3696,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_1 @@ -3693,18 +3721,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_1 @@ -3718,18 +3746,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_1 @@ -3746,24 +3774,26 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3771,24 +3801,26 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar: @@ -3797,18 +3829,18 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB87_1 @@ -3834,19 +3866,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -3864,19 +3898,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -3892,19 +3928,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -3930,25 +3968,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: @@ -3965,25 +4005,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: @@ -3998,25 +4040,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4038,19 +4082,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -4066,19 +4112,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -4094,19 +4142,21 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -4129,25 +4179,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: @@ -4162,25 +4214,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: @@ -4195,25 +4249,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -4659,18 +4715,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4682,18 +4740,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4705,18 +4765,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4734,22 +4796,24 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4759,22 +4823,24 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar: @@ -4782,18 +4848,20 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4811,18 +4879,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_1 @@ -4836,18 +4904,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_1 @@ -4861,18 +4929,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_1 @@ -4889,24 +4957,26 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4914,24 +4984,26 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar: @@ -4940,18 +5012,18 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_max_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB101_1 @@ -4977,19 +5049,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -5007,19 +5081,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -5035,19 +5111,21 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_max_u32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -5073,25 +5151,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5108,25 +5188,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -5141,25 +5223,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5182,25 +5266,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: @@ -5215,25 +5301,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: @@ -5248,25 +5336,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_max_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -5712,18 +5802,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5735,18 +5827,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5758,18 +5852,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5787,22 +5883,24 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5812,22 +5910,24 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar: @@ -5835,18 +5935,20 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5864,18 +5966,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_1 @@ -5889,18 +5991,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_1 @@ -5914,18 +6016,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_1 @@ -5942,24 +6044,26 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -5967,24 +6071,26 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar: @@ -5993,18 +6099,18 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_u32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_u32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB114_1 @@ -6453,18 +6559,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: v_mov_b32_e32 v1, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6476,18 +6584,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: v_mov_b32_e32 v1, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6499,18 +6609,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6528,22 +6640,24 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6553,22 +6667,24 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar: @@ -6576,18 +6692,20 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_mov_b32_e32 v1, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6605,18 +6723,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_1 @@ -6630,18 +6748,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_1 @@ -6655,18 +6773,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_1 @@ -6683,24 +6801,26 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v1, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s35 -; GCN1-NEXT: flat_load_dword v0, v[1:2] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6708,24 +6828,26 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v1, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s35 -; GCN2-NEXT: flat_load_dword v0, v[1:2] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar: @@ -6734,18 +6856,18 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v4, v0 -; GCN3-NEXT: v_min_i32_e32 v3, s6, v4 -; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB124_1 @@ -6771,19 +6893,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm @@ -6801,19 +6925,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm @@ -6829,19 +6955,21 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_min_i32_e32 v0, s2, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -6867,25 +6995,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: @@ -6902,25 +7032,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: @@ -6935,25 +7067,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index @@ -6966,72 +7100,78 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0xb +; GCN1-NEXT: s_mov_b64 s[2:3], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_min_i32_e32 v0, s4, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x2c +; GCN2-NEXT: s_mov_b64 s[2:3], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_min_i32_e32 v0, s4, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN3-NEXT: s_load_dword s6, s[4:5], 0x2c +; GCN3-NEXT: s_mov_b64 s[2:3], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dword v1, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm @@ -7053,25 +7193,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: s_mov_b64 s[0:1], 0 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v2 -; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: flat_store_dword v[0:1], v2 +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s3 +; GCN1-NEXT: flat_store_dword v[1:2], v0 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: @@ -7086,25 +7228,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: s_mov_b64 s[0:1], 0 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v2 -; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s3 +; GCN2-NEXT: flat_store_dword v[1:2], v0 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: @@ -7119,25 +7263,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[0:1], 0 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_min_i32_e32 v0, s6, v1 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: v_mov_b32_e32 v2, s3 +; GCN3-NEXT: flat_store_dword v[1:2], v0 ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr %out, i32 %index diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 36bddb7ac2fd6..66ac4d2198ea5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -6998,6 +6998,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7063,6 +7065,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7111,15 +7115,17 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7188,6 +7194,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: v_not_b32_e32 v1, v0 ; GCN1-NEXT: v_not_b32_e32 v0, v6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7255,6 +7263,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: v_not_b32_e32 v1, v0 ; GCN2-NEXT: v_not_b32_e32 v0, v6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7305,15 +7315,17 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 ; GCN3-NEXT: v_not_b32_e32 v1, v0 ; GCN3-NEXT: v_not_b32_e32 v0, v6 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -7373,16 +7385,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB56_2 @@ -7436,16 +7450,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB56_2 @@ -7484,23 +7500,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB56_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN3-NEXT: v_not_b32_e32 v5, v0 -; GCN3-NEXT: v_not_b32_e32 v4, v1 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_not_b32_e32 v1, v0 +; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB56_2 @@ -7557,16 +7575,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB57_2 @@ -7622,16 +7642,18 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB57_2 @@ -7672,23 +7694,25 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB57_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 -; GCN3-NEXT: v_mov_b32_e32 v6, v0 -; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 -; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 -; GCN3-NEXT: v_not_b32_e32 v5, v0 -; GCN3-NEXT: v_not_b32_e32 v4, v1 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_not_b32_e32 v1, v0 +; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB57_2 @@ -11422,14 +11446,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11488,14 +11514,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11542,18 +11570,20 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11615,14 +11645,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -11683,14 +11715,16 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -11739,18 +11773,20 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -11805,20 +11841,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB86_2 @@ -11869,20 +11907,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB86_2 @@ -11921,24 +11961,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB86_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB86_2 @@ -11992,20 +12034,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB87_2 @@ -12058,20 +12102,22 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB87_2 @@ -12112,24 +12158,26 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB87_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB87_2 @@ -12190,18 +12238,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -12261,18 +12311,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -12331,18 +12383,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -12399,24 +12453,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB89_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB89_2 @@ -12470,24 +12526,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB89_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB89_2 @@ -12540,24 +12598,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB89_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB89_2 @@ -12621,18 +12681,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -12690,18 +12752,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -12758,18 +12822,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -12823,24 +12889,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB91_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB91_2 @@ -12892,24 +12960,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB91_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB91_2 @@ -12960,24 +13030,26 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB91_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB91_2 @@ -14125,14 +14197,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14191,14 +14265,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14245,18 +14321,20 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14318,14 +14396,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14386,14 +14466,16 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -14442,18 +14524,20 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -14508,20 +14592,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB100_2 @@ -14572,20 +14658,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB100_2 @@ -14624,24 +14712,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB100_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB100_2 @@ -14695,20 +14785,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB101_2 @@ -14761,20 +14853,22 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB101_2 @@ -14815,24 +14909,26 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB101_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB101_2 @@ -14893,18 +14989,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -14964,18 +15062,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15034,18 +15134,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -15102,24 +15204,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB103_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB103_2 @@ -15173,24 +15277,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB103_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB103_2 @@ -15243,24 +15349,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB103_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB103_2 @@ -15317,24 +15425,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB104_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB104_2 @@ -15386,24 +15496,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB104_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB104_2 @@ -15454,24 +15566,26 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB104_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB104_2 @@ -16619,14 +16733,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16685,14 +16801,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16739,18 +16857,20 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -16812,14 +16932,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16880,14 +17002,16 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16936,18 +17060,20 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -17002,20 +17128,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB113_2 @@ -17066,20 +17194,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB113_2 @@ -17118,24 +17248,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB113_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB113_2 @@ -17189,20 +17321,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB114_2 @@ -17255,20 +17389,22 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB114_2 @@ -17309,24 +17445,26 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB114_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB114_2 @@ -18470,14 +18608,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18536,14 +18676,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18590,18 +18732,20 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -18663,14 +18807,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s7 -; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -18731,14 +18877,16 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s7 -; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -18787,18 +18935,20 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s34 -; GCN3-NEXT: v_mov_b32_e32 v5, s35 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s7 -; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -18853,20 +19003,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_cbranch_execnz .LBB123_2 @@ -18917,20 +19069,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_cbranch_execnz .LBB123_2 @@ -18969,24 +19123,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB123_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_cbranch_execnz .LBB123_2 @@ -19040,20 +19196,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s7 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB124_2 @@ -19106,20 +19264,22 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 -; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB124_2 @@ -19160,24 +19320,26 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_cbranch_vccz .LBB124_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: v_mov_b32_e32 v3, s35 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[36:37], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s7 -; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB124_2 @@ -19238,18 +19400,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19309,18 +19473,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19379,18 +19545,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19447,24 +19615,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB126_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB126_2 @@ -19518,24 +19688,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB126_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB126_2 @@ -19588,24 +19760,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB126_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB126_2 @@ -19665,18 +19839,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v6, s3 -; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN1-NEXT: v_mov_b32_e32 v0, s3 +; GCN1-NEXT: v_mov_b32_e32 v6, s2 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -19730,18 +19906,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: v_mov_b32_e32 v6, s3 -; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN2-NEXT: v_mov_b32_e32 v0, s3 +; GCN2-NEXT: v_mov_b32_e32 v6, s2 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -19794,18 +19972,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi ; GCN3-NEXT: s_endpgm ; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v5, s1 -; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: v_mov_b32_e32 v6, s3 -; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GCN3-NEXT: v_mov_b32_e32 v0, s3 +; GCN3-NEXT: v_mov_b32_e32 v6, s2 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -19858,24 +20038,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB128_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN1-NEXT: s_mov_b64 s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s13 -; GCN1-NEXT: v_mov_b32_e32 v5, s12 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 -; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s13 +; GCN1-NEXT: v_mov_b32_e32 v6, s12 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN1-NEXT: s_cbranch_execnz .LBB128_2 @@ -19927,24 +20109,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB128_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN2-NEXT: s_mov_b64 s[2:3], 0 -; GCN2-NEXT: v_mov_b32_e32 v4, s13 -; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 -; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s13 +; GCN2-NEXT: v_mov_b32_e32 v6, s12 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN2-NEXT: s_cbranch_execnz .LBB128_2 @@ -19995,24 +20179,26 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN3-NEXT: s_cbranch_vccz .LBB128_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v3, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GCN3-NEXT: s_mov_b64 s[2:3], 0 -; GCN3-NEXT: v_mov_b32_e32 v4, s13 -; GCN3-NEXT: v_mov_b32_e32 v5, s12 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 -; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v0, s13 +; GCN3-NEXT: v_mov_b32_e32 v6, s12 +; GCN3-NEXT: v_mov_b32_e32 v5, s1 +; GCN3-NEXT: v_mov_b32_e32 v4, s0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GCN3-NEXT: s_cbranch_execnz .LBB128_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf956..4a4fcfe5afcc8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -2187,14 +2187,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2221,14 +2221,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2250,14 +2250,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2290,12 +2290,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: v_not_b32_e32 v1, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2303,12 +2305,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2324,12 +2326,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v6 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2337,12 +2341,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: @@ -2351,14 +2355,14 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v6 ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc @@ -2390,22 +2394,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_not_b32_e32 v5, v0 -; GFX7-NEXT: v_not_b32_e32 v4, v1 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB56_1 @@ -2424,22 +2428,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v0 -; GFX8-NEXT: v_not_b32_e32 v4, v1 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -2453,22 +2457,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB56_1 @@ -2493,25 +2497,27 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX7-NEXT: v_not_b32_e32 v5, v0 -; GFX7-NEXT: v_not_b32_e32 v4, v1 -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2527,25 +2533,27 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v0 -; GFX8-NEXT: v_not_b32_e32 v4, v1 -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: @@ -2554,22 +2562,22 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 @@ -3849,17 +3857,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3884,17 +3892,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -3914,17 +3922,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3955,26 +3963,28 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -3990,26 +4000,28 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: @@ -4018,17 +4030,17 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4058,23 +4070,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB86_1 @@ -4093,23 +4105,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -4123,23 +4135,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4164,26 +4176,28 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4199,26 +4213,28 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: @@ -4227,23 +4243,23 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -4266,26 +4282,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4300,26 +4318,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4332,26 +4352,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4374,30 +4396,32 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: @@ -4411,65 +4435,69 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4488,26 +4516,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -4520,26 +4550,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -4552,26 +4584,28 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4591,30 +4625,32 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_max_i64_ret_addr64: @@ -4626,65 +4662,69 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5231,17 +5271,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5266,17 +5306,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -5296,17 +5336,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5337,26 +5377,28 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB99_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5372,26 +5414,28 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB99_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: @@ -5400,17 +5444,17 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5440,23 +5484,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB100_1 @@ -5475,23 +5519,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB100_1 @@ -5505,23 +5549,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -5546,26 +5590,28 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB101_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5581,26 +5627,28 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB101_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: @@ -5609,23 +5657,23 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -5648,26 +5696,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB102_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -5682,26 +5732,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB102_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -5714,26 +5766,28 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5756,30 +5810,32 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB103_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: @@ -5793,65 +5849,69 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB103_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -5871,30 +5931,32 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB104_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_umax_i64_ret_addr64: @@ -5906,65 +5968,69 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB104_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6511,17 +6577,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6546,17 +6612,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -6576,17 +6642,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6617,26 +6683,28 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB112_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6652,26 +6720,28 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB112_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: @@ -6680,17 +6750,17 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6720,23 +6790,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB113_1 @@ -6755,23 +6825,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB113_1 @@ -6785,23 +6855,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -6826,26 +6896,28 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB114_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6861,26 +6933,28 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB114_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: @@ -6889,23 +6963,23 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -7455,17 +7529,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -7490,17 +7564,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -7520,17 +7594,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7561,26 +7635,28 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: v_mov_b32_e32 v5, s35 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: flat_load_dword v2, v[4:5] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB122_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7596,26 +7672,28 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: flat_load_dword v2, v[4:5] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB122_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: @@ -7624,17 +7702,17 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7664,23 +7742,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_cbranch_execnz .LBB123_1 @@ -7699,23 +7777,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_cbranch_execnz .LBB123_1 @@ -7729,23 +7807,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -7770,26 +7848,28 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[2:3] -; GFX7-NEXT: s_mov_b64 s[34:35], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: s_mov_b64 s[36:37], 0 ; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 -; GFX7-NEXT: v_mov_b32_e32 v8, v0 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_cbranch_execnz .LBB124_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7805,26 +7885,28 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[2:3] -; GFX8-NEXT: s_mov_b64 s[34:35], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_mov_b64 s[36:37], 0 ; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_cbranch_execnz .LBB124_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: @@ -7833,23 +7915,23 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -7872,26 +7954,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB125_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm @@ -7906,26 +7990,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB125_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_endpgm @@ -7938,26 +8024,28 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7980,30 +8068,32 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB126_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: @@ -8017,65 +8107,69 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB126_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -8094,16 +8188,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: v_mov_b32_e32 v7, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v6, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -8124,16 +8218,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol @@ -8154,16 +8248,16 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8190,30 +8284,32 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: s_addc_u32 s1, s1, s7 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v2 -; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB128_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_min_i64_ret_addr64: @@ -8225,65 +8321,69 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: s_addc_u32 s1, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, v2 -; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB128_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 4aec2ffead437..09230d21544ff 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1426,23 +1426,23 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index ec4ea232e661c..d0fc798e55b6e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -18501,41 +18501,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18550,34 +18548,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB78_1 @@ -18591,33 +18589,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 @@ -18631,33 +18629,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB78_1 @@ -18671,34 +18669,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB78_1 @@ -18829,41 +18827,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -18878,34 +18874,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB79_1 @@ -18919,33 +18915,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 @@ -18959,33 +18955,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB79_1 @@ -19001,34 +18997,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB79_1 @@ -19159,41 +19155,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19208,34 +19202,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB80_1 @@ -19249,33 +19243,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 @@ -19289,33 +19283,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB80_1 @@ -19331,34 +19325,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB80_1 @@ -19492,41 +19486,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19539,35 +19533,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB81_1 @@ -19578,36 +19572,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19617,36 +19611,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB81_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19656,37 +19650,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB81_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19810,41 +19804,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -19857,35 +19851,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB82_1 @@ -19896,36 +19890,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19935,36 +19929,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19976,37 +19970,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20131,41 +20125,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20178,35 +20172,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB83_1 @@ -20217,36 +20211,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20256,36 +20250,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20297,37 +20291,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20462,41 +20456,39 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20511,34 +20503,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB84_1 @@ -20552,35 +20544,35 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 @@ -20594,33 +20586,33 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB84_1 @@ -20636,34 +20628,34 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB84_1 @@ -20794,41 +20786,41 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -20841,35 +20833,35 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB85_1 @@ -20880,38 +20872,38 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20921,36 +20913,36 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -20962,37 +20954,37 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21118,41 +21110,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21167,34 +21157,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB86_1 @@ -21208,33 +21198,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 @@ -21248,33 +21238,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB86_1 @@ -21288,34 +21278,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB86_1 @@ -21445,41 +21435,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21492,35 +21482,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB87_1 @@ -21531,36 +21521,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21570,36 +21560,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21609,37 +21599,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -21764,41 +21754,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -21813,34 +21801,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB88_1 @@ -21854,33 +21842,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 @@ -21894,33 +21882,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB88_1 @@ -21934,34 +21922,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB88_1 @@ -22091,41 +22079,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22138,35 +22126,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB89_1 @@ -22177,36 +22165,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22216,36 +22204,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22255,37 +22243,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22410,41 +22398,39 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22459,34 +22445,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB90_1 @@ -22500,33 +22486,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 @@ -22540,33 +22526,33 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB90_1 @@ -22580,34 +22566,34 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_add_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB90_1 @@ -22737,41 +22723,41 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -22784,35 +22770,35 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB91_1 @@ -22823,36 +22809,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22862,36 +22848,36 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -22901,37 +22887,37 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -23170,34 +23156,34 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_cbranch_execnz .LBB92_2 ; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index e2fde562d36b1..d70159b0b0ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,15 +1341,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1369,14 +1368,14 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1396,13 +1395,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1420,13 +1419,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1547,13 +1546,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1592,13 +1591,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1616,13 +1615,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1712,13 +1711,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1757,13 +1756,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1781,13 +1780,13 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1873,13 +1872,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1918,13 +1917,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1942,13 +1941,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2036,13 +2035,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2081,13 +2080,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2105,13 +2104,13 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2197,21 +2196,21 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2241,20 +2240,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2264,20 +2263,20 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2352,21 +2351,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2396,20 +2395,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2419,20 +2418,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2510,21 +2509,21 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2554,20 +2553,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2577,20 +2576,20 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2671,13 +2670,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2716,13 +2715,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2742,13 +2741,13 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2835,21 +2834,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2879,22 +2878,22 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2904,20 +2903,20 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2991,15 +2990,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3029,15 +3028,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3075,15 +3074,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3101,15 +3100,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3164,15 +3163,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3202,15 +3201,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3248,15 +3247,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3276,15 +3275,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3338,15 +3337,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3376,15 +3375,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3422,15 +3421,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3450,15 +3449,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3511,21 +3510,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3548,22 +3547,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3593,21 +3592,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3617,21 +3616,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3674,21 +3673,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3711,22 +3710,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3756,21 +3755,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3782,21 +3781,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3840,21 +3839,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3877,22 +3876,22 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3922,21 +3921,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3948,21 +3947,21 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4007,15 +4006,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4045,15 +4044,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4073,15 +4072,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4102,13 +4101,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4126,15 +4125,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4152,15 +4151,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4178,26 +4177,28 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4212,13 +4213,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4226,13 +4228,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4256,15 +4259,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4294,15 +4297,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4340,15 +4343,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4366,15 +4369,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4433,9 +4436,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4447,11 +4449,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4484,14 +4487,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4511,9 +4514,8 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4525,11 +4527,12 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4553,7 +4556,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4565,9 +4567,10 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4596,14 +4599,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4630,14 +4633,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4664,17 +4667,17 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4787,11 +4790,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4801,11 +4803,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4840,14 +4843,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4868,11 +4871,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4882,11 +4884,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4911,10 +4914,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4923,9 +4925,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4955,14 +4958,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4990,14 +4993,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5025,17 +5028,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5152,11 +5155,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5166,11 +5168,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5206,14 +5209,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5234,11 +5237,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5248,11 +5250,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5277,10 +5280,9 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5289,9 +5291,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5321,14 +5324,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5356,14 +5359,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5391,17 +5394,17 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5517,9 +5520,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5531,9 +5533,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5567,13 +5570,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5593,9 +5596,8 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off @@ -5607,9 +5609,10 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5634,7 +5637,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -5646,8 +5648,9 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5676,13 +5679,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_max_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -5709,13 +5712,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -5742,16 +5745,16 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5859,36 +5862,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5906,29 +5909,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5938,37 +5941,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5980,31 +5983,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6015,31 +6018,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6049,31 +6052,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6083,32 +6086,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6213,36 +6216,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6261,29 +6264,29 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6293,37 +6296,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6335,31 +6338,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6370,31 +6373,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6404,31 +6407,31 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6438,32 +6441,32 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6569,15 +6572,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6602,14 +6605,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX940-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -6628,15 +6631,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6659,14 +6662,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6688,14 +6691,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_max_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6714,14 +6717,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6742,19 +6745,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -6844,24 +6847,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6875,23 +6878,23 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX940-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6901,25 +6904,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6931,24 +6934,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6958,22 +6961,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_max_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6983,22 +6986,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7010,22 +7013,22 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7114,11 +7117,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7128,11 +7130,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7168,14 +7171,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -7196,11 +7199,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7210,11 +7212,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7239,10 +7242,9 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7251,9 +7253,10 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7283,14 +7286,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_max_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -7320,14 +7323,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_max_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -7355,17 +7358,17 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_max_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7480,37 +7483,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7528,29 +7531,29 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7560,37 +7563,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7602,31 +7605,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7637,33 +7640,33 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7673,31 +7676,31 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7707,32 +7710,32 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11681,15 +11684,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11711,14 +11714,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11736,15 +11739,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11764,14 +11767,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11791,13 +11794,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11815,13 +11818,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11839,21 +11842,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11974,15 +11977,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12004,14 +12007,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12029,15 +12032,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12057,14 +12060,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12084,13 +12087,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12108,13 +12111,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12134,21 +12137,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12269,15 +12272,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12299,14 +12302,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12324,15 +12327,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12352,14 +12355,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12379,13 +12382,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12403,13 +12406,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12429,21 +12432,21 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12567,21 +12570,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12595,22 +12598,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12620,22 +12623,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12647,21 +12650,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12672,20 +12675,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12695,20 +12698,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12718,24 +12721,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12849,21 +12852,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12877,22 +12880,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12902,22 +12905,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12929,21 +12932,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12954,20 +12957,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12977,20 +12980,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13002,24 +13005,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13134,21 +13137,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13162,22 +13165,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13187,22 +13190,22 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13214,21 +13217,21 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13239,20 +13242,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13262,20 +13265,20 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13287,24 +13290,24 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13428,15 +13431,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13459,14 +13462,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13484,15 +13487,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13512,14 +13515,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13539,13 +13542,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_max_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_max_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13565,13 +13568,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13591,21 +13594,21 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13725,22 +13728,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_max_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13754,22 +13757,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13779,22 +13782,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13806,21 +13809,21 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_max_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13831,22 +13834,22 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13856,20 +13859,20 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13881,24 +13884,24 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14018,38 +14021,36 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14066,35 +14067,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14107,41 +14108,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14156,34 +14155,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14197,33 +14196,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14237,33 +14236,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14277,34 +14276,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14417,38 +14416,36 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14465,35 +14462,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14506,41 +14503,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14555,34 +14550,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14596,33 +14591,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14636,33 +14631,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14678,34 +14673,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14818,38 +14813,36 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14866,35 +14859,35 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14907,41 +14900,39 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14956,34 +14947,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14997,33 +14988,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15037,33 +15028,33 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15079,34 +15070,34 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15222,38 +15213,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15267,38 +15258,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15308,41 +15299,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15355,35 +15346,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15394,36 +15385,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15433,36 +15424,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15472,37 +15463,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15608,38 +15599,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15653,38 +15644,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15694,41 +15685,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15741,35 +15732,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15780,36 +15771,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15819,36 +15810,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15860,37 +15851,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15997,38 +15988,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16042,38 +16033,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16083,41 +16074,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16130,35 +16121,35 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16169,36 +16160,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16208,36 +16199,36 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16249,37 +16240,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16395,39 +16386,37 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_max_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16444,35 +16433,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16485,41 +16474,39 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_max_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16534,34 +16521,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16575,35 +16562,35 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16617,33 +16604,33 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_max_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16659,34 +16646,34 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16798,39 +16785,39 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_max_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16844,38 +16831,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16885,41 +16872,41 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16932,35 +16919,35 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16971,38 +16958,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17012,36 +16999,36 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17053,37 +17040,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 903e80b15814f..ecea5fdecfcd0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -32,13 +32,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -77,13 +77,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -101,13 +101,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -193,13 +193,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -238,13 +238,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -262,13 +262,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -356,13 +356,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -401,13 +401,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -425,13 +425,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -517,21 +517,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB3_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -561,20 +561,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -584,20 +584,20 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -672,21 +672,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB4_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -716,20 +716,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -739,20 +739,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -830,21 +830,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB5_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -874,20 +874,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -897,20 +897,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -991,13 +991,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1036,13 +1036,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -1062,13 +1062,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1155,21 +1155,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB7_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1199,22 +1199,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1224,20 +1224,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1317,13 +1317,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1341,15 +1341,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_max_f32 v3, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1369,14 +1368,14 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1396,13 +1395,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1420,13 +1419,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1547,13 +1546,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1592,13 +1591,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1616,13 +1615,13 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1712,13 +1711,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1757,13 +1756,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1781,13 +1780,13 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1873,13 +1872,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1918,13 +1917,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -1942,13 +1941,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2036,13 +2035,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2081,13 +2080,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -2105,13 +2104,13 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2197,21 +2196,21 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2241,20 +2240,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2264,20 +2263,20 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2352,21 +2351,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2396,20 +2395,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2419,20 +2418,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2510,21 +2509,21 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2554,20 +2553,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2577,20 +2576,20 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2671,13 +2670,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2716,13 +2715,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2742,13 +2741,13 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2835,21 +2834,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2879,22 +2878,22 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2904,20 +2903,20 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f32_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2991,15 +2990,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3029,15 +3028,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3075,15 +3074,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3101,15 +3100,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3164,15 +3163,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3202,15 +3201,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3248,15 +3247,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3276,15 +3275,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3338,15 +3337,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -3376,15 +3375,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3422,15 +3421,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3450,15 +3449,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[6:7] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3511,21 +3510,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3548,22 +3547,22 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3593,21 +3592,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3617,21 +3616,21 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3674,21 +3673,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3711,22 +3710,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3756,21 +3755,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3782,21 +3781,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3840,21 +3839,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3877,22 +3876,22 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -3922,21 +3921,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3948,21 +3947,21 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4007,15 +4006,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4045,15 +4044,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4073,15 +4072,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX10-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4102,13 +4101,13 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -4126,15 +4125,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4152,15 +4151,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4178,26 +4177,28 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v11, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4212,13 +4213,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 -; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4226,13 +4228,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v10, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7] +; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 ; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -4256,15 +4259,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -4294,15 +4297,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4340,15 +4343,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -4366,15 +4369,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] ; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4433,9 +4436,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off @@ -4447,11 +4449,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4484,14 +4487,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4511,9 +4514,8 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off @@ -4525,11 +4527,12 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4553,7 +4556,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -4565,9 +4567,10 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4596,14 +4599,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4630,14 +4633,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -4664,17 +4667,17 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -4787,11 +4790,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4801,11 +4803,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4840,14 +4843,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -4868,11 +4871,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4882,11 +4884,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -4911,10 +4914,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -4923,9 +4925,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4955,14 +4958,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -4990,14 +4993,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5025,17 +5028,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5152,11 +5155,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5166,11 +5168,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5206,14 +5209,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5234,11 +5237,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5248,11 +5250,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -5277,10 +5280,9 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -5289,9 +5291,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5321,14 +5324,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc @@ -5356,14 +5359,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -5391,17 +5394,17 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5517,9 +5520,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-NEXT: global_load_b32 v4, v[0:1], off @@ -5531,9 +5533,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5567,13 +5570,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 @@ -5593,9 +5596,8 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off @@ -5607,9 +5609,10 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 @@ -5634,7 +5637,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 @@ -5646,8 +5648,9 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5676,13 +5679,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc @@ -5709,13 +5712,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -5742,16 +5745,16 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -5859,36 +5862,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5906,29 +5909,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5938,37 +5941,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -5980,31 +5983,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 @@ -6015,31 +6018,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6049,31 +6052,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6083,32 +6086,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6213,36 +6216,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6261,29 +6264,29 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6293,37 +6296,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6335,31 +6338,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1 @@ -6370,31 +6373,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6404,31 +6407,31 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6438,32 +6441,32 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6569,15 +6572,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 +; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6602,14 +6605,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX940-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 @@ -6628,15 +6631,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 @@ -6659,14 +6662,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6688,14 +6691,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX90A-NEXT: v_min_f16_e32 v3, v3, v4 ; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6714,14 +6717,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v5 ; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6742,19 +6745,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 @@ -6844,24 +6847,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_max_num_f16_e32 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-NEXT: v_max_num_f16_e32 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-NEXT: v_min_num_f16_e32 v3, v5, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6875,23 +6878,23 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 ; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX940-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6901,25 +6904,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-NEXT: v_max_f16_e32 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6931,24 +6934,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX10-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6958,22 +6961,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: v_max_f16_e32 v4, v5, v5 +; GFX90A-NEXT: v_min_f16_e32 v3, v4, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6983,22 +6986,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX908-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7010,22 +7013,22 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7114,11 +7117,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7128,11 +7130,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v7 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7168,14 +7171,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX940-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX940-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -7196,11 +7199,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7210,11 +7212,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 @@ -7239,10 +7242,9 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff @@ -7251,9 +7253,10 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_min_f16_e32 v5, v5, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7283,14 +7286,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v3, v7 +; GFX90A-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v6, v6, v6 +; GFX90A-NEXT: v_min_f16_e32 v5, v6, v5 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX90A-NEXT: buffer_wbl2 @@ -7320,14 +7323,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX908-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX908-NEXT: v_min_f16_e32 v5, v7, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc @@ -7355,17 +7358,17 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, v3, v6 +; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v7, v7 +; GFX8-NEXT: v_min_f16_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7480,37 +7483,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: v_max_num_f16_e32 v7, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v7 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7528,29 +7531,29 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX940-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7560,37 +7563,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7602,31 +7605,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX10-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB35_1 @@ -7637,33 +7640,33 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX90A-NEXT: v_min_f16_e32 v4, v4, v7 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7673,31 +7676,31 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX908-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7707,32 +7710,32 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_max_f16_e32 v7, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11681,15 +11684,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -11711,14 +11714,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -11736,15 +11739,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -11764,14 +11767,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11791,13 +11794,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -11815,13 +11818,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -11839,21 +11842,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v3, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 @@ -11974,15 +11977,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12004,14 +12007,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12029,15 +12032,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12057,14 +12060,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12084,13 +12087,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12108,13 +12111,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12134,21 +12137,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB47_1 @@ -12269,15 +12272,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -12299,14 +12302,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -12324,15 +12327,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -12352,14 +12355,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12379,13 +12382,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 @@ -12403,13 +12406,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -12429,21 +12432,21 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB48_1 @@ -12567,21 +12570,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12595,22 +12598,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12620,22 +12623,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12647,21 +12650,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB49_1 @@ -12672,20 +12675,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12695,20 +12698,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12718,24 +12721,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12849,21 +12852,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -12877,22 +12880,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12902,22 +12905,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12929,21 +12932,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -12954,20 +12957,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12977,20 +12980,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13002,24 +13005,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13134,21 +13137,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13162,22 +13165,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13187,22 +13190,22 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13214,21 +13217,21 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -13239,20 +13242,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13262,20 +13265,20 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13287,24 +13290,24 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13428,15 +13431,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -13459,14 +13462,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -13484,15 +13487,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -13512,14 +13515,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -13539,13 +13542,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 -; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 +; GFX90A-NEXT: v_pk_min_f16 v4, v3, v4 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -13565,13 +13568,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_max_f16 v5, v2, v2 ; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v3, v5 ; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -13591,21 +13594,21 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v7, v1, v1 +; GFX8-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -13725,22 +13728,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 +; GFX12-NEXT: v_pk_min_num_f16 v3, v5, v3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -13754,22 +13757,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_pk_max_f16 v4, v5, v5 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13779,22 +13782,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v5, v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX11-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13806,21 +13809,21 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX10-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX10-NEXT: v_pk_min_f16 v3, v5, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -13831,22 +13834,22 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13856,20 +13859,20 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_pk_max_f16 v5, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v3, v5, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13881,24 +13884,24 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v5, v7, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14018,38 +14021,36 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14066,35 +14067,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 @@ -14107,41 +14108,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14156,34 +14155,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -14197,33 +14196,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 @@ -14237,33 +14236,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 @@ -14277,34 +14276,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 @@ -14417,38 +14416,36 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14465,35 +14462,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 @@ -14506,41 +14503,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14555,34 +14550,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -14596,33 +14591,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 @@ -14636,33 +14631,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 @@ -14678,34 +14673,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 @@ -14818,38 +14813,36 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14866,35 +14859,35 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -14907,41 +14900,39 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14956,34 +14947,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -14997,33 +14988,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -15037,33 +15028,33 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -15079,34 +15070,34 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -15222,38 +15213,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15267,38 +15258,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15308,41 +15299,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15355,35 +15346,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -15394,36 +15385,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15433,36 +15424,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15472,37 +15463,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15608,38 +15599,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15653,38 +15644,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15694,41 +15685,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15741,35 +15732,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB58_1 @@ -15780,36 +15771,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15819,36 +15810,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15860,37 +15851,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15997,38 +15988,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16042,38 +16033,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16083,41 +16074,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16130,35 +16121,35 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB59_1 @@ -16169,36 +16160,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16208,36 +16199,36 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16249,37 +16240,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16395,39 +16386,37 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_min_num_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16444,35 +16433,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB60_1 @@ -16485,41 +16474,39 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_min_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16534,34 +16521,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB60_1 @@ -16575,35 +16562,35 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 @@ -16617,33 +16604,33 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_min_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB60_1 @@ -16659,34 +16646,34 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_min_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB60_1 @@ -16798,39 +16785,39 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_min_num_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16844,38 +16831,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16885,41 +16872,41 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16932,35 +16919,35 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB61_1 @@ -16971,38 +16958,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17012,36 +16999,36 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17053,37 +17040,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 3dbf6477a7cb8..88a7a5f4d9c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -14477,38 +14477,36 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14525,35 +14523,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB50_1 @@ -14566,41 +14564,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -14615,34 +14611,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB50_1 @@ -14656,33 +14652,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 @@ -14696,33 +14692,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB50_1 @@ -14736,34 +14732,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB50_1 @@ -14876,38 +14872,36 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -14924,35 +14918,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB51_1 @@ -14965,41 +14959,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15014,34 +15006,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB51_1 @@ -15055,33 +15047,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 @@ -15095,33 +15087,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB51_1 @@ -15137,34 +15129,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB51_1 @@ -15277,38 +15269,36 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15325,35 +15315,35 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB52_1 @@ -15366,41 +15356,39 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15415,34 +15403,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB52_1 @@ -15456,33 +15444,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 @@ -15496,33 +15484,33 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB52_1 @@ -15538,34 +15526,34 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB52_1 @@ -15681,38 +15669,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -15726,38 +15714,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v5, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15767,41 +15755,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -15814,35 +15802,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB53_1 @@ -15853,36 +15841,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15892,36 +15880,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v4, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15931,37 +15919,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16067,38 +16055,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16112,38 +16100,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16153,41 +16141,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16200,35 +16188,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB54_1 @@ -16239,36 +16227,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16278,36 +16266,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16319,37 +16307,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16456,38 +16444,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16501,38 +16489,38 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16542,41 +16530,41 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16589,35 +16577,35 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB55_1 @@ -16628,36 +16616,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16667,36 +16655,36 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16708,37 +16696,37 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -16854,39 +16842,37 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -16903,35 +16889,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB56_1 @@ -16944,41 +16930,39 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: v_dual_sub_f32 v5, v7, v5 :: v_dual_lshlrev_b32 v6, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -16993,34 +16977,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v6, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB56_1 @@ -17034,35 +17018,35 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 @@ -17076,33 +17060,33 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX908-NEXT: v_sub_f32_e32 v3, v7, v3 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v3, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB56_1 @@ -17118,34 +17102,34 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_sub_f32_e32 v5, v6, v5 +; GFX8-NEXT: v_sub_f32_e32 v0, v7, v0 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB56_1 @@ -17257,39 +17241,39 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX12-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -17303,38 +17287,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v4, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17344,41 +17328,41 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v8, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -17391,35 +17375,35 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB57_1 @@ -17430,38 +17414,38 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17471,36 +17455,36 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX908-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v3, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17512,37 +17496,37 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index b9592a9ff9073..dde6671178af7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1 ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_movk_i32 s2, 0x100 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: s_movk_i32 s1, 0x100 ; GCN-NEXT: .LBB1_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1] -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc -; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc +; GCN-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 ; GCN-NEXT: s_cbranch_vccz .LBB1_1 ; GCN-NEXT: ; %bb.2: ; %bb2 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index f7882e6f12022..10d6a54baf665 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -2528,19 +2528,21 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: v_not_b32_e32 v2, v2 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2621,23 +2623,25 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s6, v3 -; VI-NEXT: v_not_b32_e32 v2, v2 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar: @@ -2714,19 +2718,19 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v0, s6, v4 -; VI-NEXT: v_not_b32_e32 v3, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB57_1 @@ -2806,25 +2810,27 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_and_b32_e32 v0, s6, v4 -; VI-NEXT: v_not_b32_e32 v3, v0 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar: @@ -4298,18 +4304,20 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4388,22 +4396,24 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar: @@ -4478,18 +4488,18 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB89_1 @@ -4567,24 +4577,26 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar: @@ -4653,26 +4665,26 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4759,32 +4771,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: @@ -4866,24 +4878,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB93_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -4969,30 +4981,30 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB94_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: @@ -5551,18 +5563,20 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5641,22 +5655,24 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar: @@ -5731,18 +5747,18 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB103_1 @@ -5820,24 +5836,26 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_max_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar: @@ -5906,26 +5924,26 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_u32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_max_u32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB105_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -6012,32 +6030,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB106_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: @@ -6127,30 +6145,30 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_max_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_max_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: @@ -6709,18 +6727,20 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6799,22 +6819,24 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_u32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar: @@ -6889,18 +6911,18 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB116_1 @@ -6978,24 +7000,26 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_min_u32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: @@ -7542,18 +7566,20 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7632,22 +7658,24 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar: @@ -7722,18 +7750,18 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB126_1 @@ -7811,24 +7839,26 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v1, s34 -; VI-NEXT: v_mov_b32_e32 v2, s35 -; VI-NEXT: flat_load_dword v0, v[1:2] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_i32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar: @@ -7897,26 +7927,26 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s3, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_min_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_i32_e32 v0, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8003,32 +8033,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: @@ -8101,25 +8131,25 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: s_load_dword s4, s[4:5], 0x2c +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_load_dword s5, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_min_i32_e32 v2, s2, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_i32_e32 v0, s4, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -8200,30 +8230,30 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_ashr_i32 s5, s7, 31 ; VI-NEXT: s_mov_b32 s4, s7 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dword s7, s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s6, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_min_i32_e32 v0, s6, v1 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB131_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 59a99a6a0328d..645bb0b117ccb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -2626,14 +2626,14 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2730,15 +2730,17 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 ; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: v_not_b32_e32 v1, v0 ; VI-NEXT: v_not_b32_e32 v0, v6 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -2746,12 +2748,12 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar: @@ -2837,22 +2839,22 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v0, s7, v7 -; VI-NEXT: v_and_b32_e32 v1, s6, v6 -; VI-NEXT: v_not_b32_e32 v5, v0 -; VI-NEXT: v_not_b32_e32 v4, v1 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_and_b32_e32 v0, s7, v3 +; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_not_b32_e32 v1, v0 +; VI-NEXT: v_not_b32_e32 v0, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB56_1 @@ -2941,28 +2943,30 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 -; VI-NEXT: v_mov_b32_e32 v6, v0 -; VI-NEXT: v_and_b32_e32 v0, s7, v7 -; VI-NEXT: v_and_b32_e32 v1, s6, v6 -; VI-NEXT: v_not_b32_e32 v5, v0 -; VI-NEXT: v_not_b32_e32 v4, v1 -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_and_b32_e32 v0, s7, v3 +; VI-NEXT: v_and_b32_e32 v6, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_not_b32_e32 v1, v0 +; VI-NEXT: v_not_b32_e32 v0, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar: @@ -4434,45 +4438,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4483,17 +4487,17 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4513,14 +4517,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4542,45 +4546,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4590,29 +4594,31 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar: @@ -4621,14 +4627,14 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4651,45 +4657,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4700,23 +4706,23 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB86_1 @@ -4730,20 +4736,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB86_1 @@ -4759,45 +4765,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4807,29 +4813,31 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar: @@ -4838,20 +4846,20 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 @@ -4875,29 +4883,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4907,26 +4915,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4943,24 +4951,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4989,17 +4997,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -5026,68 +5034,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 @@ -5116,29 +5124,29 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5150,30 +5158,30 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm @@ -5182,24 +5190,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5227,17 +5235,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -5266,64 +5274,64 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 @@ -5896,45 +5904,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5945,17 +5953,17 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -5975,14 +5983,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6004,45 +6012,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6052,29 +6060,31 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar: @@ -6083,14 +6093,14 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6113,45 +6123,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6162,23 +6172,23 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB100_1 @@ -6192,20 +6202,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB100_1 @@ -6221,45 +6231,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6269,29 +6279,31 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar: @@ -6300,20 +6312,20 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 @@ -6337,29 +6349,29 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6369,26 +6381,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6405,24 +6417,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6451,17 +6463,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6488,68 +6500,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 @@ -6577,17 +6589,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -6616,64 +6628,64 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB104_1 @@ -7246,45 +7258,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7295,17 +7307,17 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7325,14 +7337,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7354,45 +7366,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7402,29 +7414,31 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar: @@ -7433,14 +7447,14 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7463,45 +7477,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7512,23 +7526,23 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB113_1 @@ -7542,20 +7556,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB113_1 @@ -7571,45 +7585,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7619,29 +7633,31 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar: @@ -7650,20 +7666,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 @@ -8232,45 +8248,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8281,17 +8297,17 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8311,14 +8327,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8340,45 +8356,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8388,29 +8404,31 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s34 -; VI-NEXT: v_mov_b32_e32 v5, s35 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v6, s7 -; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar: @@ -8419,14 +8437,14 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -8449,45 +8467,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8498,23 +8516,23 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] ; VI-NEXT: s_cbranch_execnz .LBB123_1 @@ -8528,20 +8546,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB123_1 @@ -8557,45 +8575,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8605,29 +8623,31 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 -; VI-NEXT: v_mov_b32_e32 v8, v0 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar: @@ -8636,20 +8656,20 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 @@ -8673,29 +8693,29 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8705,26 +8725,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_add_u32 s0, s0, s4 +; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8741,24 +8761,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -8787,17 +8807,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -8824,68 +8844,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 -; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 @@ -8907,35 +8927,35 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v3 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; SI-NEXT: v_mov_b32_e32 v2, v6 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm @@ -8946,18 +8966,18 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -8974,20 +8994,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9014,17 +9034,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v6, v2 @@ -9053,64 +9073,64 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_add_u32 s0, s0, s6 +; VI-NEXT: s_addc_u32 s1, s1, s7 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] -; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 ; GFX9-NEXT: s_add_u32 s0, s8, s0 ; GFX9-NEXT: s_addc_u32 s1, s9, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[7:8] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 7792422291998..785114cdbf39a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -328,13 +328,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -400,14 +399,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -758,14 +757,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1367,13 +1366,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1439,14 +1437,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1797,14 +1795,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2406,13 +2404,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2478,14 +2475,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2836,14 +2833,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4054,12 +4051,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4067,7 +4064,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4083,16 +4080,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4166,20 +4163,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4191,14 +4188,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4206,19 +4203,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4229,8 +4226,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4275,20 +4272,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4300,28 +4297,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4331,16 +4326,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4385,20 +4382,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4410,27 +4407,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4440,16 +4435,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4484,14 +4481,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4500,7 +4497,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4513,16 +4510,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4530,9 +4527,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4540,16 +4535,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4586,14 +4584,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4602,7 +4600,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4615,24 +4613,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4640,16 +4636,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4688,29 +4684,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4731,17 +4730,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5094,23 +5093,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5123,8 +5122,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5134,7 +5131,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5229,9 +5228,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5240,7 +5239,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5248,8 +5246,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5258,8 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5267,10 +5264,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,7 +5379,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5387,10 +5386,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,13 +5807,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5883,15 +5882,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6068,16 +6067,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6138,16 +6137,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_max_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6191,25 +6189,24 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6563,17 +6560,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6655,15 +6651,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7602,12 +7598,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7615,7 +7611,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7631,16 +7627,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[41:42] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7714,20 +7710,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7739,14 +7735,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7754,19 +7750,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7777,8 +7773,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7823,20 +7819,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7848,28 +7844,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7879,16 +7873,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7933,20 +7929,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7958,27 +7954,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7988,16 +7982,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8032,14 +8028,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8048,7 +8044,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8061,16 +8057,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8078,9 +8074,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8088,16 +8082,19 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8134,14 +8131,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8150,7 +8147,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8163,24 +8160,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8188,16 +8183,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 -; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8236,29 +8231,32 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8279,17 +8277,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8642,23 +8640,23 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8671,8 +8669,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8682,7 +8678,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8777,9 +8775,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8788,7 +8786,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8796,8 +8793,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8806,8 +8804,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8815,10 +8811,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8901,9 +8900,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8911,15 +8910,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8927,7 +8926,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8935,10 +8933,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index cb3291df891af..1f521f2444984 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -328,13 +328,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -400,14 +399,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -758,14 +757,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1367,13 +1366,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -1439,14 +1437,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1797,14 +1795,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2406,13 +2404,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v3, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc @@ -2478,14 +2475,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v3, s[0:1] ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v4, v0 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2836,14 +2833,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1] ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 -; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v6 +; GFX9-DPP-NEXT: v_max_f32_e32 v6, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v6, v0 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -4054,12 +4051,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX7LESS-NEXT: .LBB7_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -4067,7 +4064,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB7_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -4083,16 +4080,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4166,20 +4163,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4191,14 +4188,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -4206,19 +4203,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -4229,8 +4226,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4275,20 +4272,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4300,28 +4297,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4331,16 +4326,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -4385,20 +4382,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4410,27 +4407,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -4440,16 +4435,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -4484,14 +4481,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -4500,7 +4497,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4513,16 +4510,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -4530,9 +4527,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4540,16 +4535,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -4586,14 +4584,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -4602,7 +4600,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4615,24 +4613,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -4640,16 +4636,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -4688,29 +4684,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -4731,17 +4730,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_1 @@ -5094,23 +5093,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5123,8 +5122,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -5134,7 +5131,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -5229,9 +5228,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -5240,7 +5239,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -5248,8 +5246,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5258,8 +5257,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5267,10 +5264,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -5353,9 +5353,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5363,15 +5363,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -5379,7 +5379,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -5387,10 +5386,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5807,13 +5807,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 -; GFX7LESS-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX7LESS-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 @@ -5883,15 +5882,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX9-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6068,16 +6067,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 -; GFX1164-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1164-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6138,16 +6137,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX1132-NEXT: .LBB9_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[7:8], v[2:3], v[2:3] ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: v_min_f64 v[0:1], v[7:8], v[0:1] ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] @@ -6191,25 +6189,24 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX7LESS-DPP-NEXT: v_max_f64 v[6:7], v[4:5], v[4:5] +; GFX7LESS-DPP-NEXT: v_min_f64 v[2:3], v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6563,17 +6560,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1164-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1164-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[8:9] @@ -6655,15 +6651,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[8:9], v10, s[0:1] ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: v_max_f64 v[11:12], v[8:9], v[8:9] ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[6:7], v[0:1] +; GFX1132-DPP-NEXT: v_min_f64 v[6:7], v[11:12], v[6:7] ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[6:7], v10, v[6:9], s[0:1] glc ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[8:9] @@ -7602,12 +7598,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX7LESS-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4 @@ -7615,7 +7611,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX7LESS-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX7LESS-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7631,16 +7627,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX7LESS-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], v[41:42] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -7714,20 +7710,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX9-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1] ; GFX9-NEXT: v_readlane_b32 s3, v1, s4 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4 -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX9-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX9-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7739,14 +7735,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: ; %bb.3: ; GFX9-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX9-NEXT: s_mov_b64 s[46:47], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX9-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX9-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX9-NEXT: s_add_u32 s8, s36, 44 ; GFX9-NEXT: s_addc_u32 s9, s37, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] @@ -7754,19 +7750,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX9-NEXT: s_mov_b32 s12, s43 ; GFX9-NEXT: s_mov_b32 s13, s42 ; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 @@ -7777,8 +7773,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7823,20 +7819,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7848,28 +7844,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1064-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: v_mov_b32_e32 v5, 8 -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1064-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7879,16 +7873,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7933,20 +7929,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 ; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1032-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7958,27 +7954,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: ; %bb.3: ; GFX1032-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1032-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-NEXT: v_mov_b32_e32 v3, s45 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -7988,16 +7982,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-NEXT: s_mov_b32 s13, s42 ; GFX1032-NEXT: s_mov_b32 s14, s33 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s45 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v4, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_or_b32 s46, vcc_lo, s46 @@ -8032,14 +8028,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1] -; GFX1164-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -8048,7 +8044,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1164-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1164-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8061,16 +8057,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: ; %bb.3: ; GFX1164-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1164-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1164-NEXT: .p2align 6 ; GFX1164-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-NEXT: s_waitcnt vmcnt(0) -; GFX1164-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1164-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] @@ -8078,9 +8074,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s44 -; GFX1164-NEXT: v_mov_b32_e32 v3, s45 -; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8088,16 +8082,19 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-NEXT: s_mov_b32 s12, s43 ; GFX1164-NEXT: s_mov_b32 s13, s42 ; GFX1164-NEXT: s_mov_b32 s14, s33 -; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1164-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v3, s45 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1164-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -8134,14 +8131,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX1132-NEXT: v_mov_b32_e32 v41, 0 +; GFX1132-NEXT: v_mov_b32_e32 v42, 0x7ff80000 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[2:3], v[41:42], v[41:42] ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1 @@ -8150,7 +8147,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX1132-NEXT: v_min_f64 v[41:42], v[2:3], v[4:5] ; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -8163,24 +8160,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: ; %bb.3: ; GFX1132-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_load_b64 v[4:5], v0, s[44:45] +; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-NEXT: .p2align 6 ; GFX1132-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-NEXT: s_waitcnt vmcnt(0) -; GFX1132-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX1132-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-NEXT: v_mov_b32_e32 v3, s45 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8188,16 +8183,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-NEXT: s_mov_b32 s12, s43 ; GFX1132-NEXT: s_mov_b32 s13, s42 ; GFX1132-NEXT: s_mov_b32 s14, s33 -; GFX1132-NEXT: v_mov_b32_e32 v6, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, s44 -; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] -; GFX1132-NEXT: scratch_store_b64 off, v[4:5], off -; GFX1132-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 8 -; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX1132-NEXT: scratch_load_b64 v[4:5], off, off +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 @@ -8236,29 +8231,32 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[46:47], 0 -; GFX7LESS-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] ; GFX7LESS-DPP-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[40:41], v[40:41] ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[4:5], v[2:3] ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 @@ -8279,17 +8277,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s43 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s42 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7LESS-DPP-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7LESS-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47] ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_1 @@ -8642,23 +8640,23 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: ; GFX1032-DPP-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 -; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8671,8 +8669,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s44 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -8682,7 +8678,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s45 @@ -8777,9 +8775,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -8788,7 +8786,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1164-DPP-NEXT: s_mov_b64 s[46:47], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] @@ -8796,8 +8793,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: .p2align 6 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8806,8 +8804,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8815,10 +8811,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s44 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s45 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 @@ -8901,9 +8900,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 -; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -8911,15 +8910,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_load_b64 s[44:45], s[34:35], 0x24 -; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[44:45] ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1132-DPP-NEXT: .p2align 6 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[41:42], v[41:42] ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: v_max_f64 v[5:6], v[1:2], v[1:2] ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] @@ -8927,7 +8926,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[40:41] ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[38:39] @@ -8935,10 +8933,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_mov_b32 s12, s43 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s42 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s44 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s45 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 5c5a769178dd9..f1d58814f9f04 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -54,18 +54,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24 +; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s12, s13 ; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: s_mov_b32 s19, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s3, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s18, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_cselect_b32 s19, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 ; GFX11-NEXT: ; %bb.2: ; %bb15 @@ -110,58 +110,60 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 ; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, s30 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s30 ; GFX11-NEXT: s_mul_i32 s0, s0, s22 -; GFX11-NEXT: s_mul_i32 s0, s0, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s19, s0 +; GFX11-NEXT: s_mul_i32 s0, s0, s20 +; GFX11-NEXT: s_or_b32 s0, s24, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-NEXT: s_mov_b32 s0, s1 -; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-NEXT: global_load_u16 v0, v0, s[20:21] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s19 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s20, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 ; GFX11-NEXT: s_and_b32 s1, s8, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-NEXT: s_and_b32 s13, 0xffff, s0 +; GFX11-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s1, s21, s20 +; GFX11-NEXT: s_and_b32 s20, 0xffff, s13 ; GFX11-NEXT: s_and_b32 s1, s1, 1 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b32 s20, -1, 0 +; GFX11-NEXT: s_and_b32 s22, s9, exec_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s20 +; GFX11-NEXT: v_readfirstlane_b32 s20, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-NEXT: s_cselect_b32 s13, s19, s13 +; GFX11-NEXT: v_readfirstlane_b32 s21, v0 +; GFX11-NEXT: s_cselect_b32 s20, s21, s20 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0 -; GFX11-NEXT: s_or_b32 s0, s13, s0 +; GFX11-NEXT: s_bitcmp1_b32 s20, 0 +; GFX11-NEXT: s_cselect_b32 s20, 0x100, 0 +; GFX11-NEXT: s_or_b32 s13, s20, s13 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow ; GFX11-NEXT: s_mov_b32 s0, 0 @@ -179,7 +181,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s18, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s20, s2, exec_lo +; GFX11-NEXT: s_and_b32 s19, s2, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -196,10 +198,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-NEXT: s_or_b32 s19, s19, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX11-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-NEXT: s_and_saveexec_b32 s0, s19 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index dd478f94e1039..67cdf196a4693 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -32,38 +32,38 @@ body: | ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GCN-NEXT: liveins: $vcc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc ; GCN-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY1]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_1]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_2:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_2]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_3:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_3]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_4:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_4]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_5:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_5]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_6:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_6]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_7:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_7]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_8:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_8]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_9:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_9]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_10:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_10]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_11:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_11]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_12:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GCN-NEXT: liveins: $vcc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_1]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_2]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_3]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_4]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_5]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_6]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_7]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_8]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_9]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_10]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_11]], implicit $exec - ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, [[V_CVT_F64_I32_e32_12]], implicit $exec + ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_12]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_13:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec ; GCN-NEXT: $vcc = V_CMP_EQ_U64_e64 $vcc, killed [[V_CVT_F64_I32_e32_13]], implicit $exec ; GCN-NEXT: [[V_CVT_F64_I32_e32_14:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index eb2d95e4db2d5..b079637103fa4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; CHECK-LABEL: struct_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; CHECK-LABEL: struct_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; CHECK-LABEL: struct_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v2, v[2:3] +; CHECK-NEXT: flat_load_b32 v1, v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index bc50b12b59049..e056f05aea451 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -6,20 +6,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -67,20 +67,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -99,20 +99,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 4 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -130,20 +130,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB4_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 dlc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB4_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -194,20 +194,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1] -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -227,20 +227,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -260,23 +260,23 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -297,20 +297,20 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b128 v[1:4], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm @@ -330,22 +330,22 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34 ; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_b32 s4, s[4:5], 0x34 ; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .LBB10_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: buffer_load_b64 v[1:2], v1, s[0:3], 0 idxen offset:4 glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_load_b32 v2, v[2:3] +; CHECK-NEXT: flat_load_b32 v1, v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB10_1 ; CHECK-NEXT: ; %bb.2: ; %bb2 ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 23b57a7efa586..f961b7d9d5223 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5818,39 +5818,38 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5865,33 +5864,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5905,29 +5904,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5944,29 +5943,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5984,30 +5983,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6125,39 +6124,38 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_add_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6172,33 +6170,33 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6212,29 +6210,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6251,29 +6249,29 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6291,30 +6289,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6432,39 +6430,39 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6477,34 +6475,34 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6515,35 +6513,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6553,35 +6551,35 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6592,36 +6590,36 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6729,39 +6727,39 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6774,34 +6772,34 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6812,35 +6810,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6850,35 +6848,35 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6889,36 +6887,36 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_add_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7630,21 +7628,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB28_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: ds_read_b32 v1, v2 +; GFX7-NEXT: s_lshl_b32 s10, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 @@ -7660,22 +7659,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 ; GFX7-NEXT: .LBB28_7: ; %Flow22 @@ -7710,23 +7710,24 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: .LBB28_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -7754,21 +7755,22 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB28_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: ds_read_b32 v1, v2 +; GFX6-NEXT: s_lshl_b32 s10, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 @@ -7784,22 +7786,23 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 ; GFX6-NEXT: .LBB28_7: ; %Flow20 @@ -7834,23 +7837,24 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: .LBB28_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8467,21 +8471,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX7-NEXT: s_cbranch_execz .LBB29_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_lshl_b32 s8, s3, 3 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: ds_read_b32 v1, v2 +; GFX7-NEXT: s_lshl_b32 s10, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: ds_read_b32 v1, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 @@ -8497,22 +8502,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s3, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_read_b32 v3, v1 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v2, v1 ; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 ; GFX7-NEXT: .LBB29_7: ; %Flow22 @@ -8547,23 +8553,24 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v2, v3 -; GFX7-NEXT: s_mov_b64 s[2:3], 0 +; GFX7-NEXT: ds_read_b32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow -; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: .LBB29_13: ; %Flow20 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -8591,21 +8598,22 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_cbranch_execz .LBB29_4 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_lshl_b32 s8, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: ds_read_b32 v1, v2 +; GFX6-NEXT: s_lshl_b32 s10, s3, 3 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 @@ -8621,22 +8629,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s3, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_read_b32 v3, v1 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 -; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v4, v2, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v2 ; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 ; GFX6-NEXT: .LBB29_7: ; %Flow20 @@ -8671,23 +8680,24 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v2, v3 -; GFX6-NEXT: s_mov_b64 s[2:3], 0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_11: ; %atomicrmw.start8 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_add_f32_e32 v4, v3, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v2, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow -; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: .LBB29_13: ; %Flow18 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index d419b0cdfdd1a..47058de71e7f4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_max_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_max_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX940-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_max_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_max_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,37 +5594,36 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5641,30 +5640,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5680,39 +5679,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5727,33 +5725,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5767,29 +5765,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5806,29 +5804,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5846,30 +5844,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5971,37 +5969,36 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_max_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_num_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6018,30 +6015,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6057,39 +6054,38 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_max_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6104,33 +6100,33 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6144,29 +6140,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6183,29 +6179,29 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6223,30 +6219,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6348,37 +6344,37 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6392,36 +6388,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6431,39 +6427,39 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6476,34 +6472,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6514,35 +6510,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6552,35 +6548,35 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6591,36 +6587,36 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6712,37 +6708,37 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_max_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6756,36 +6752,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6795,39 +6791,39 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6840,34 +6836,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6878,35 +6874,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6916,35 +6912,35 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6955,36 +6951,36 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 282947afa409a..1c9cff7326d65 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -4520,15 +4520,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4550,17 +4550,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB20_1 @@ -4573,15 +4573,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4600,14 +4600,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4626,16 +4626,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 @@ -4649,16 +4649,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 @@ -4673,17 +4673,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -4792,15 +4792,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 -; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1 +; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_max_num_f16 v4, v3, v3 +; GFX12-NEXT: v_pk_min_num_f16 v2, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -4822,17 +4822,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v4 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB21_1 @@ -4845,15 +4845,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX11-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX11-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -4872,14 +4872,14 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX10-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v3, v3 +; GFX10-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -4898,16 +4898,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX90A-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 @@ -4921,16 +4921,16 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX908-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 +; GFX908-NEXT: v_pk_max_f16 v2, v4, v4 +; GFX908-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 @@ -4945,17 +4945,17 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-NEXT: v_max_f16_sdwa v5, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v4, v4 +; GFX8-NEXT: v_min_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5064,14 +5064,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5093,13 +5093,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5115,14 +5115,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5141,13 +5141,13 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5166,12 +5166,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5188,12 +5188,12 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5209,23 +5209,23 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5326,14 +5326,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_pk_max_num_f16 v3, v1, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2 +; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1 +; GFX12-NEXT: v_pk_min_num_f16 v3, v4, v3 ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -5355,13 +5355,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX940-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5377,14 +5377,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v3, v2, v2 +; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX11-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5403,13 +5403,13 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX10-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX10-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5428,12 +5428,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX90A-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5450,12 +5450,12 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_pk_max_f16 v3, v1, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_pk_max_f16 v3, v2, v2 -; GFX908-NEXT: v_pk_min_f16 v3, v3, v1 +; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX908-NEXT: v_pk_min_f16 v3, v4, v3 ; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 @@ -5471,23 +5471,23 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_max_f16_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v3, v3 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX8-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5594,37 +5594,36 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5641,30 +5640,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5680,39 +5679,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -5727,33 +5725,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -5767,29 +5765,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5806,29 +5804,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5846,30 +5844,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -5971,37 +5969,36 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_min_num_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_min_num_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6018,30 +6015,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6057,39 +6054,38 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_min_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6104,33 +6100,33 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6144,29 +6140,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6183,29 +6179,29 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6223,30 +6219,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6348,37 +6344,37 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6392,36 +6388,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6431,39 +6427,39 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6476,34 +6472,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -6514,35 +6510,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6552,35 +6548,35 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6591,36 +6587,36 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6712,37 +6708,37 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_min_num_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_min_num_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6756,36 +6752,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6795,39 +6791,39 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_min_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6840,34 +6836,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -6878,35 +6874,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6916,35 +6912,35 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6955,36 +6951,36 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 1b08b64b046b4..2433eca80b23c 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -6390,37 +6390,36 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6437,30 +6436,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6476,39 +6475,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6523,33 +6521,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 @@ -6563,29 +6561,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6602,29 +6600,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6642,30 +6640,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6767,37 +6765,36 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -6814,30 +6811,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX940-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX940-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX940-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v3, v2, s5 ; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6853,39 +6850,38 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_lshlrev_b32 v2, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_dual_sub_f32 v4, v6, v4 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff ; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v7, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -6900,33 +6896,33 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX10-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_sub_f32_e32 v2, v5, v2 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v7, s4 +; GFX10-NEXT: v_perm_b32 v2, v4, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 @@ -6940,29 +6936,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX90A-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -6979,29 +6975,29 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX908-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX908-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX908-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v2, v3, v2, s9 ; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7019,30 +7015,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v2, v6, v2 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7144,37 +7140,37 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7188,36 +7184,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: ds_read_b32 v2, v0 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7227,39 +7223,39 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7272,34 +7268,34 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 @@ -7310,35 +7306,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: ds_read_b32 v2, v0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7348,35 +7344,35 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: ds_read_b32 v2, v0 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7387,36 +7383,36 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7508,37 +7504,37 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX12-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX12-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -7552,36 +7548,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 ; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX940-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX940-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX940-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] -; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 -; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX940-NEXT: v_perm_b32 v3, v4, v3, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7591,39 +7587,39 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v3, v4, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 ; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 @@ -7636,34 +7632,34 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX10-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX10-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v4, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX10-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1 @@ -7674,35 +7670,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX90A-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX90A-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7712,35 +7708,35 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX908-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX908-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX908-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX908-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX908-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX908-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s8 +; GFX908-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX908-NEXT: v_perm_b32 v3, v4, v3, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -7751,36 +7747,36 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 +; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_sub_f32_e32 v4, v6, v5 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 8157b1a7f7c80..be8a6295c8a71 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -161,13 +161,13 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_mov_b64 s[6:7], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual -; CHECK-NEXT: s_add_u32 s4, s6, 1 -; CHECK-NEXT: s_addc_u32 s5, s7, 0 -; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; CHECK-NEXT: s_add_u32 s6, s6, 1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_mov_b64 s[6:7], 1 ; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 ; CHECK-NEXT: ; %bb.7: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index a9b8663a48dea..86a6ad46e8768 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -5,23 +5,21 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s0, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_mov_b32 s1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: s_cmp_lg_u32 s2, 0 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX942-NEXT: s_or_b32 s4, s3, 1 -; GFX942-NEXT: s_ashr_i32 s5, s3, 31 -; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX942-NEXT: s_or_b32 s4, s1, 1 +; GFX942-NEXT: s_ashr_i32 s5, s1, 31 +; GFX942-NEXT: s_mov_b32 s1, s0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 @@ -29,56 +27,54 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: s_and_b32 s3, s5, s4 +; GFX942-NEXT: s_and_b32 s1, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: -; GFX942-NEXT: ; implicit-def: $sgpr3 +; GFX942-NEXT: ; implicit-def: $sgpr1 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: s_mov_b32 s3, 0 +; GFX908-NEXT: s_mov_b32 s0, 0 +; GFX908-NEXT: s_mov_b32 s1, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_cmp_lg_u32 s0, 0 -; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX908-NEXT: s_cmp_lg_u32 s2, 0 +; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX908-NEXT: s_or_b32 s4, s3, 1 -; GFX908-NEXT: s_ashr_i32 s5, s3, 31 -; GFX908-NEXT: s_mov_b32 s3, s2 +; GFX908-NEXT: s_or_b32 s4, s1, 1 +; GFX908-NEXT: s_ashr_i32 s5, s1, 31 +; GFX908-NEXT: s_mov_b32 s1, s0 ; GFX908-NEXT: s_nop 3 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_mov_b32_e32 v5, s3 -; GFX908-NEXT: v_mov_b32_e32 v4, s2 +; GFX908-NEXT: v_mov_b32_e32 v5, s1 +; GFX908-NEXT: v_mov_b32_e32 v4, s0 ; GFX908-NEXT: v_mov_b32_e32 v2, v1 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: s_and_b32 s3, s5, s4 +; GFX908-NEXT: s_and_b32 s1, s5, s4 ; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: ; implicit-def: $sgpr3 +; GFX908-NEXT: ; implicit-def: $sgpr1 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 3e45a2d0df43d..c80254210109a 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -23,10 +23,8 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 ; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GCN-NEXT: s_mov_b32 s12, s6 ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %Flow1 @@ -36,7 +34,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-NEXT: .LBB0_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN-NEXT: s_lshl_b32 s12, s12, 5 ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; in Loop: Header=BB0_4 Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 96dd6276f7e38..cb9f6c1f38eea 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1744,8 +1744,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 23364e860d154..c1cc61bbacd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1865,8 +1865,8 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB13_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 12eec4fa3bd59..15dc545175055 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,71 +32,51 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] -; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 -; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[64:65] ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s66, 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS1-NEXT: s_xor_b64 s[70:71], s[68:69], -1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 -; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[72:73], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 +; GLOBALNESS1-NEXT: s_mov_b32 s60, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s61, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s62, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS1-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -125,23 +105,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s67, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_8 @@ -151,7 +131,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s67, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 @@ -163,15 +143,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[74:75], s[62:63] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -183,72 +163,70 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_16 ; GLOBALNESS1-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[74:75] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[70:71] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[72:73] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 +; GLOBALNESS1-NEXT: s_add_u32 s58, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s59, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[4:5], 0x0 +; GLOBALNESS1-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[58:59] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -264,12 +242,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[74:75] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[76:77] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -293,9 +271,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -311,9 +289,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s62 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s61 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s60 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -324,71 +302,51 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] -; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 -; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[64:65] ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 +; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s66, 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[68:69], -1, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS0-NEXT: s_xor_b64 s[70:71], s[68:69], -1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 -; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], v[0:1] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[64:65], s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[72:73], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 -; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 +; GLOBALNESS0-NEXT: s_mov_b32 s58, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s59, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s60, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS0-NEXT: s_xor_b64 s[74:75], s[4:5], -1 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v0 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[44:45], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[46:47], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[50:51], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -417,23 +375,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[68:69] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s67, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_8 @@ -443,7 +401,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: .LBB1_8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s67, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 @@ -455,15 +413,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[52:53], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[62:63] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[76:77], s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -475,72 +433,70 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[54:55], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[56:57], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_16 ; GLOBALNESS0-NEXT: .LBB1_14: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[74:75] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_16: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[70:71] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[72:73] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: .LBB1_21: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 +; GLOBALNESS0-NEXT: s_add_u32 s62, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s63, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[4:5], 0x0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 @@ -557,11 +513,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[76:77] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -585,9 +541,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 @@ -603,9 +559,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s60 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s59 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s58 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index db7d816386a70..089f3d255e87a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1187,8 +1187,8 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 0acee5bd5ac19..945b87b6e1628 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -52,13 +52,10 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; CHECK-NEXT: .LBB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB1_1 ; CHECK-NEXT: ; %bb.2: ; %bb11 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index a794d139063d5..bc7b375cd404d 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1294,8 +1294,8 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 +; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll index 0211c5111c31d..41d324dd7abfc 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -6,15 +6,15 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: .LBB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 +; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 +; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll index d1c4459aaa6ee..2bd89dacb2b37 100644 --- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll +++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll @@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i ; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: move $s7, $zero -; CHECK-NEXT: move $s0, $zero +; CHECK-NEXT: move $s6, $zero +; CHECK-NEXT: move $s1, $zero ; CHECK-NEXT: ld.d $t0, $sp, 184 -; CHECK-NEXT: ld.d $s2, $sp, 176 -; CHECK-NEXT: ld.d $s1, $sp, 168 -; CHECK-NEXT: ld.d $t1, $sp, 160 -; CHECK-NEXT: ld.d $t2, $sp, 152 -; CHECK-NEXT: ld.d $t3, $sp, 144 -; CHECK-NEXT: ld.d $t4, $sp, 136 -; CHECK-NEXT: ld.d $t5, $sp, 128 -; CHECK-NEXT: ld.d $t6, $sp, 120 -; CHECK-NEXT: ld.d $t7, $sp, 112 -; CHECK-NEXT: ld.d $t8, $sp, 104 -; CHECK-NEXT: ld.d $fp, $sp, 96 +; CHECK-NEXT: ld.d $t1, $sp, 176 +; CHECK-NEXT: ld.d $s2, $sp, 168 +; CHECK-NEXT: ld.d $t2, $sp, 160 +; CHECK-NEXT: ld.d $t3, $sp, 152 +; CHECK-NEXT: ld.d $t4, $sp, 144 +; CHECK-NEXT: ld.d $t5, $sp, 136 +; CHECK-NEXT: ld.d $t6, $sp, 128 +; CHECK-NEXT: ld.d $t7, $sp, 120 +; CHECK-NEXT: ld.d $t8, $sp, 112 +; CHECK-NEXT: ld.d $fp, $sp, 104 +; CHECK-NEXT: ld.d $s0, $sp, 96 ; CHECK-NEXT: andi $a4, $a4, 1 -; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4 -; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0) -; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0) -; CHECK-NEXT: slli.d $s3, $s2, 2 -; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1 -; CHECK-NEXT: add.d $s2, $t5, $s2 -; CHECK-NEXT: addi.w $s4, $zero, -41 +; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4 +; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0) +; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0) ; CHECK-NEXT: ori $s3, $zero, 1 -; CHECK-NEXT: slli.d $s4, $s4, 3 -; CHECK-NEXT: ori $s6, $zero, 3 -; CHECK-NEXT: lu32i.d $s6, 262144 +; CHECK-NEXT: ori $s4, $zero, 50 +; CHECK-NEXT: ori $s5, $zero, 3 +; CHECK-NEXT: lu32i.d $s5, 262144 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s8, $zero, 1 +; CHECK-NEXT: ori $s7, $zero, 1 ; CHECK-NEXT: .LBB0_2: # %if.else.i106 ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3 -; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1 -; CHECK-NEXT: add.d $s0, $t0, $s0 -; CHECK-NEXT: ldx.bu $s8, $s0, $s8 +; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3 +; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1 +; CHECK-NEXT: add.d $s1, $t0, $s1 +; CHECK-NEXT: ldx.bu $s7, $s1, $s7 ; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: st.b $zero, $t5, 0 -; CHECK-NEXT: st.b $s7, $t3, 0 -; CHECK-NEXT: st.b $zero, $t8, 0 -; CHECK-NEXT: st.b $zero, $t1, 0 -; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t6, 0 +; CHECK-NEXT: st.b $s6, $t4, 0 +; CHECK-NEXT: st.b $zero, $fp, 0 ; CHECK-NEXT: st.b $zero, $t2, 0 -; CHECK-NEXT: st.b $s8, $a5, 0 -; CHECK-NEXT: ori $s0, $zero, 1 -; CHECK-NEXT: move $s7, $a3 +; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t3, 0 +; CHECK-NEXT: st.b $s7, $a5, 0 +; CHECK-NEXT: ori $s1, $zero, 1 +; CHECK-NEXT: move $s6, $a3 ; CHECK-NEXT: .LBB0_4: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: beqz $a4, .LBB0_9 ; CHECK-NEXT: # %bb.5: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: move $s6, $zero ; CHECK-NEXT: bnez $zero, .LBB0_8 ; CHECK-NEXT: # %bb.6: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: slli.d $s8, $zero, 3 -; CHECK-NEXT: ldx.d $s8, $s8, $s1 -; CHECK-NEXT: jr $s8 +; CHECK-NEXT: slli.d $s7, $zero, 3 +; CHECK-NEXT: ldx.d $s7, $s7, $s2 +; CHECK-NEXT: jr $s7 ; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s7, $zero, 1 +; CHECK-NEXT: ori $s6, $zero, 1 ; CHECK-NEXT: .LBB0_8: # %if.else58.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ldx.bu $s7, $a6, $s7 +; CHECK-NEXT: ldx.bu $s6, $a6, $s6 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_9: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: andi $s7, $s7, 255 -; CHECK-NEXT: ori $s5, $zero, 50 -; CHECK-NEXT: bltu $s5, $s7, .LBB0_15 +; CHECK-NEXT: andi $s6, $s6, 255 +; CHECK-NEXT: bltu $s4, $s6, .LBB0_15 ; CHECK-NEXT: # %bb.10: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: sll.d $s7, $s3, $s7 -; CHECK-NEXT: and $s8, $s7, $s6 -; CHECK-NEXT: move $s7, $fp -; CHECK-NEXT: beqz $s8, .LBB0_15 +; CHECK-NEXT: sll.d $s6, $s3, $s6 +; CHECK-NEXT: and $s7, $s6, $s5 +; CHECK-NEXT: move $s6, $s0 +; CHECK-NEXT: beqz $s7, .LBB0_15 ; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s8, $zero -; CHECK-NEXT: st.b $zero, $t7, 0 -; CHECK-NEXT: ldx.b $ra, $s2, $t4 +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: st.b $zero, $t8, 0 +; CHECK-NEXT: slli.d $s8, $t1, 2 +; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1 +; CHECK-NEXT: add.d $s8, $t6, $s8 +; CHECK-NEXT: ldx.b $s8, $s8, $t5 ; CHECK-NEXT: st.b $zero, $a2, 0 ; CHECK-NEXT: st.b $zero, $a7, 0 -; CHECK-NEXT: st.b $zero, $t6, 0 -; CHECK-NEXT: st.b $ra, $a0, 0 +; CHECK-NEXT: st.b $zero, $t7, 0 +; CHECK-NEXT: st.b $s8, $a0, 0 ; CHECK-NEXT: bnez $s3, .LBB0_13 ; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: addi.w $s8, $zero, -41 +; CHECK-NEXT: slli.d $s8, $s8, 3 ; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1) ; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1) -; CHECK-NEXT: ldx.d $s5, $s4, $ra -; CHECK-NEXT: jr $s5 +; CHECK-NEXT: ldx.d $s8, $s8, $ra +; CHECK-NEXT: jr $s8 ; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: bnez $s3, .LBB0_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index c35f05be304cc..ec2448cb3965f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: j .LBB0_11 ; RV64-NEXT: .LBB0_8: # %vector.ph ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: slli t6, t0, 28 -; RV64-NEXT: sub t6, t6, t1 +; RV64-NEXT: slli t6, t0, 1 +; RV64-NEXT: slli s0, t0, 28 +; RV64-NEXT: sub t6, s0, t6 ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: csrwi vxrm, 0 ; RV64-NEXT: mv s0, a2 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index d076cb00ad7e0..6d082802f9cd7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov lr, r0 @@ -364,50 +364,48 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel r7, r2, r0, lt +; CHECK-NEXT: csel r3, r2, r0, lt ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: subs r1, r1, r7 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: movt r2, #43690 -; CHECK-NEXT: ldr r6, [sp, #128] -; CHECK-NEXT: movw r8, :lower16:c +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: movw r9, :lower16:c ; CHECK-NEXT: umull r1, r2, r1, r2 -; CHECK-NEXT: movt r8, :upper16:c +; CHECK-NEXT: adr.w r8, .LCPI1_1 ; CHECK-NEXT: movs r1, #4 -; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: mov.w r9, #12 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: @ implicit-def: $r7 +; CHECK-NEXT: movt r9, :upper16:c +; CHECK-NEXT: mov.w r10, #12 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: add.w r0, r0, r2, lsr #1 -; CHECK-NEXT: bic r3, r1, #3 +; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI1_1 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vadd.i32 q4, q0, r7 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: vadd.i32 q4, q0, r3 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: cmn.w r11, #4 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: cmn.w r7, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r11 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: subs r1, r0, r7 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 ; CHECK-NEXT: add.w r2, r3, r2, lsr #1 @@ -415,19 +413,18 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: lsls r3, r3, #3 ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: adds r5, #2 -; CHECK-NEXT: subs.w r1, r5, lr -; CHECK-NEXT: asr.w r0, r5, #31 +; CHECK-NEXT: add.w r11, r11, #2 +; CHECK-NEXT: subs.w r1, r11, lr +; CHECK-NEXT: asr.w r0, r11, #31 ; CHECK-NEXT: sbcs.w r0, r0, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader @@ -436,36 +433,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Child Loop BB1_10 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_14 Depth 3 -; CHECK-NEXT: cmp.w r11, #2 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: bgt .LBB1_5 ; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: cmp r7, #5 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #5 ; CHECK-NEXT: bhi .LBB1_17 ; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #120] +; CHECK-NEXT: ldrd r2, r3, [sp, #104] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r6 -; CHECK-NEXT: mov r7, r12 -; CHECK-NEXT: mov r6, lr +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r4, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov lr, r6 -; CHECK-NEXT: mov r6, r4 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: mov r12, r6 +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: b .LBB1_10 ; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: cmn.w r0, #4 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bge .LBB1_5 ; CHECK-NEXT: .LBB1_10: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 @@ -488,13 +484,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q7, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 +; CHECK-NEXT: add.w r1, r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q6, q2 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 @@ -507,13 +504,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vcmp.u32 hi, q6, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 +; CHECK-NEXT: add.w r1, r1, #4 +; CHECK-NEXT: vcmp.u32 hi, q5, q2 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_14 @@ -523,7 +521,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: beq .LBB1_9 ; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: eor r1, r10, #1 +; CHECK-NEXT: eor r1, r5, #1 ; CHECK-NEXT: lsls r1, r1, #31 ; CHECK-NEXT: bne .LBB1_9 ; CHECK-NEXT: b .LBB1_26 @@ -532,11 +530,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: .LBB1_19: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lsls.w r1, r10, #31 +; CHECK-NEXT: lsls r1, r5, #31 ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 @@ -552,19 +550,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bgt .LBB1_25 ; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 -; CHECK-NEXT: add.w r11, r0, #28 +; CHECK-NEXT: add.w r7, r0, #28 ; CHECK-NEXT: cmn.w r0, #25 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: blt .LBB1_19 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #14 +; CHECK-NEXT: add.w r7, r0, #14 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #21 +; CHECK-NEXT: add.w r7, r0, #21 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_26: @ %for.inc19.us ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -574,7 +572,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index e63c62574dafb..83ea44704e63f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -594,71 +594,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #28 -; CHECK-NEXT: sub sp, #28 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: add r2, sp, #12 -; CHECK-NEXT: mov.w r9, #8 +; CHECK-NEXT: mov.w r10, #8 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI13_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: sub.w r7, r1, #8 +; CHECK-NEXT: add.w r0, r6, r7, lsr #3 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q1, [r2] -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: vldrh.s32 q2, [r2, #8] -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r7, r5, d5 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: vldrh.s32 q2, [r2] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, r10, d5 -; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vadd.i16 q0, q0, r10 +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vldrh.s32 q1, [r0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r12, r11, d3 +; CHECK-NEXT: ldrh.w r8, [r6] +; CHECK-NEXT: vmov r2, r6, d2 ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r2, [r10] -; CHECK-NEXT: ldrh.w r10, [r3] -; CHECK-NEXT: vmov r3, r11, d4 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh.w r11, [r11] -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r11 -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: vmov.16 q2[4], r10 -; CHECK-NEXT: vmov.16 q2[5], r4 -; CHECK-NEXT: vmov.16 q2[6], r7 -; CHECK-NEXT: vmov.16 q2[7], r5 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: ldrh.w r1, [r11] +; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: ldrh.w r5, [r12] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r5 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q1[4], r11 +; CHECK-NEXT: vmov.16 q1[5], r4 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.16 q1[7], r8 +; CHECK-NEXT: vstrb.8 q1, [r7], #16 ; CHECK-NEXT: le lr, .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: bne .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -711,145 +711,144 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI14_2 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add r4, sp, #72 +; CHECK-NEXT: add r7, sp, #40 +; CHECK-NEXT: add r5, sp, #56 ; CHECK-NEXT: bic r1, r1, #7 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q2, #0x18 -; CHECK-NEXT: add.w r1, r2, r1, lsr #3 -; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill -; CHECK-NEXT: adr r1, .LCPI14_0 -; CHECK-NEXT: adr r2, .LCPI14_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 -; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q6, #0x18 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: add.w r1, r3, r1, lsr #3 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: add.w r10, sp, #104 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: adr r1, .LCPI14_2 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI14_1 +; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: .LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q5, [r2] -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: vldrh.s32 q0, [r2, #8] +; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r2] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r6, r2, d4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 -; CHECK-NEXT: ldrh.w r11, [r5] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r6, r10, d1 ; CHECK-NEXT: vldrh.s32 q0, [r4] -; CHECK-NEXT: vmov.16 q7[0], r5 -; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q6, q0, r0 +; CHECK-NEXT: vmov r7, r4, d12 +; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: ldrh.w r2, [r10] +; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: ldrh.w r8, [r3] +; CHECK-NEXT: ldrh r3, [r6] +; CHECK-NEXT: ldrh r2, [r7] +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vstrw.32 q3, [r7] +; CHECK-NEXT: vldrh.s32 q0, [r7] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.16 q4[1], r4 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, r9, d0 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: vmov r4, r6, d0 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vldrh.s32 q0, [r7, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q1[0], r6 -; CHECK-NEXT: ldrh.w r6, [r9] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: vmov.16 q1[3], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: ldrh r4, [r6] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vmov.16 q1[5], r6 -; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov.16 q5[2], r1 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q5[4], r1 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vstrw.32 q4, [r10] -; CHECK-NEXT: vldrh.s32 q0, [r6] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vstrw.32 q2, [r5] +; CHECK-NEXT: vldrh.s32 q0, [r5] +; CHECK-NEXT: vmov.16 q5[6], r1 +; CHECK-NEXT: vmov.16 q5[7], r2 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r5 -; CHECK-NEXT: vmov r2, r5, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q6, q6, q2 -; CHECK-NEXT: vadd.i16 q5, q5, q2 -; CHECK-NEXT: vadd.i16 q4, q4, q2 -; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: vmov.16 q7[0], r1 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r1, r2, d13 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q3, q3, q6 +; CHECK-NEXT: vadd.i16 q1, q1, q6 +; CHECK-NEXT: vadd.i16 q2, q2, q6 +; CHECK-NEXT: ldrh.w r10, [r2] ; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: vldrh.s32 q0, [r6, #8] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q7[2], r9 +; CHECK-NEXT: vldrh.s32 q0, [r5, #8] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.16 q7[3], r5 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.16 q7[4], r1 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov.16 q7[6], r12 -; CHECK-NEXT: vmov.16 q7[7], r11 +; CHECK-NEXT: vmov.16 q4[3], r10 +; CHECK-NEXT: vmov.16 q4[4], r9 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r8 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov.16 q4[7], r1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: vmov r2, r4, d0 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov.16 q7[5], r4 ; CHECK-NEXT: vmov r2, r4, d1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: vmov.16 q3[7], r4 -; CHECK-NEXT: vadd.i16 q0, q3, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q7 -; CHECK-NEXT: vstrb.8 q0, [r7], #16 +; CHECK-NEXT: vmov.16 q7[6], r2 +; CHECK-NEXT: vmov.16 q7[7], r4 +; CHECK-NEXT: mov r4, r11 +; CHECK-NEXT: vadd.i16 q0, q7, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: le lr, .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne.w .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -925,260 +924,246 @@ for.cond.cleanup: ; preds = %for.body, %middle.b define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { ; CHECK-LABEL: gather_inc_v16i8_complex: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB15_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #312 -; CHECK-NEXT: sub sp, #312 -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill -; CHECK-NEXT: blt.w .LBB15_5 -; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r1, .LCPI15_0 -; CHECK-NEXT: adr r6, .LCPI15_8 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI15_1 -; CHECK-NEXT: adr r7, .LCPI15_7 -; CHECK-NEXT: adr r3, .LCPI15_6 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI15_5 -; CHECK-NEXT: bic r10, r2, #7 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI15_9 -; CHECK-NEXT: vmov.i32 q2, #0x30 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .pad #160 +; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: bic lr, r2, #7 +; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: vmov.i32 q0, #0x30 ; CHECK-NEXT: .LBB15_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI15_1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_8 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI15_7 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI15_9 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_5 +; CHECK-NEXT: mov r9, lr +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI15_6 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: adr r1, .LCPI15_3 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: adr r1, .LCPI15_4 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI15_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI15_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: adr r1, .LCPI15_11 -; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: adr r1, .LCPI15_11 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: .LBB15_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill -; CHECK-NEXT: vmov r1, lr, d8 -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r6, r7, d13 -; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q6, r0 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q6, q3, r0 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q1, r0 +; CHECK-NEXT: vmov r10, r1, d15 +; CHECK-NEXT: vmov r7, r11, d6 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov r5, r3, d13 +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: subs.w r9, r9, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r9, [r1] -; CHECK-NEXT: vmov r1, r3, d14 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: ldrb r1, [r7] +; CHECK-NEXT: vmov r7, r4, d12 ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[0], r1 -; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[1], r1 -; CHECK-NEXT: vmov r1, r3, d12 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[0], r7 +; CHECK-NEXT: vmov.8 q6[1], r4 +; CHECK-NEXT: vmov.8 q6[2], r5 +; CHECK-NEXT: vmov r4, r5, d14 +; CHECK-NEXT: vmov.8 q6[3], r3 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r7, [r5] +; CHECK-NEXT: vmov.8 q7[0], r4 +; CHECK-NEXT: ldrb.w r5, [r10] +; CHECK-NEXT: vmov.8 q7[1], r7 +; CHECK-NEXT: ldrb.w r7, [r11] ; CHECK-NEXT: vmov.8 q7[2], r5 -; CHECK-NEXT: ldrb r5, [r6] -; CHECK-NEXT: ldrb r6, [r4] +; CHECK-NEXT: vmov r5, r10, d5 ; CHECK-NEXT: vmov.8 q7[3], r6 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov.8 q6[1], r3 -; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov.8 q6[3], r7 -; CHECK-NEXT: ldrb.w r7, [lr] -; CHECK-NEXT: vmov.8 q6[4], r9 -; CHECK-NEXT: vmov.8 q6[5], r7 -; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, r3, d9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: vmov r1, r7, d0 -; CHECK-NEXT: vmov.8 q6[7], r3 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrb r6, [r5] +; CHECK-NEXT: vmov r1, r5, d7 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: vmov.8 q7[7], r3 -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov r4, r1, d0 -; CHECK-NEXT: vmov.8 q7[10], r12 -; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: vmov r1, r7, d2 +; CHECK-NEXT: vmov.8 q7[7], r5 +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q6[4], r1 +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[5], r7 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q6[6], r1 +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: vmov.8 q6[7], r5 +; CHECK-NEXT: vmov r3, r7, d2 +; CHECK-NEXT: vmov.8 q6[8], r1 +; CHECK-NEXT: vmov r1, r11, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[9], r4 +; CHECK-NEXT: vmov.8 q6[10], r6 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: ldrb r4, [r7] +; CHECK-NEXT: ldrb.w r7, [r10] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[11], r7 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[10], r5 -; CHECK-NEXT: vmov.8 q6[11], r4 -; CHECK-NEXT: vmov.8 q6[12], r7 -; CHECK-NEXT: vmov.8 q6[13], r6 -; CHECK-NEXT: vmov.8 q6[14], r3 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[8], r5 +; CHECK-NEXT: vmov r5, r7, d3 +; CHECK-NEXT: vmov.8 q7[9], r6 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q7[10], r5 +; CHECK-NEXT: vmov.8 q7[11], r7 +; CHECK-NEXT: vmov.8 q7[12], r3 +; CHECK-NEXT: vmov.8 q7[13], r4 +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov.8 q6[12], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.8 q6[13], r1 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q5, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov.8 q6[14], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: ldrb.w r1, [lr] ; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vadd.i8 q6, q6, q7 +; CHECK-NEXT: ldrb.w r1, [r11] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vadd.i8 q6, q7, q6 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov r1, r3, d3 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i8 q1, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q1, [r8], #16 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q6, q0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: bne.w .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 -; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: bne.w .LBB15_2 -; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #312 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI15_0: @@ -1287,102 +1272,95 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill -; CHECK-NEXT: blt.w .LBB16_5 +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: blt .LBB16_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r5, .LCPI16_3 -; CHECK-NEXT: adr r7, .LCPI16_1 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI16_0 -; CHECK-NEXT: adr r6, .LCPI16_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: bic r9, r1, #7 -; CHECK-NEXT: vldrw.u32 q3, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: mov.w lr, #16 -; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov.w r11, #16 +; CHECK-NEXT: bic r3, r1, #7 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB16_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 -; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: adr r1, .LCPI16_3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI16_1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI16_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 -; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr -; CHECK-NEXT: vadd.i32 q4, q4, lr -; CHECK-NEXT: vadd.i32 q5, q5, lr -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: ldrb r3, [r7] -; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q6, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r4, r5, d13 +; CHECK-NEXT: vadd.i32 q3, q5, r11 +; CHECK-NEXT: vmov lr, r8, d4 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmov r6, r12, d5 +; CHECK-NEXT: vadd.i32 q2, q4, r11 +; CHECK-NEXT: vadd.i32 q1, q1, r11 +; CHECK-NEXT: vadd.i32 q0, q0, r11 +; CHECK-NEXT: ldrb.w r10, [r5] +; CHECK-NEXT: vmov r2, r5, d12 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q4, r0 +; CHECK-NEXT: ldrb.w r1, [r8] +; CHECK-NEXT: ldrb.w r9, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: ldrb.w r12, [r12] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[0], r2 +; CHECK-NEXT: vmov.8 q4[1], r5 +; CHECK-NEXT: vmov r8, r5, d11 +; CHECK-NEXT: vmov.8 q4[2], r9 +; CHECK-NEXT: vmov.8 q4[3], r10 +; CHECK-NEXT: vmov.8 q4[4], r6 +; CHECK-NEXT: vmov.8 q4[5], r1 +; CHECK-NEXT: vmov.8 q4[6], r4 +; CHECK-NEXT: vmov r4, r6, d10 +; CHECK-NEXT: vmov.8 q4[7], r12 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: ldrb.w lr, [r5] +; CHECK-NEXT: vmov r5, r2, d13 ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w r10, [r10] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb.w r1, [r12] -; CHECK-NEXT: vmov.8 q0[0], r7 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: vmov.8 q0[5], r10 -; CHECK-NEXT: vmov.8 q0[6], r3 -; CHECK-NEXT: vmov.8 q0[7], r11 -; CHECK-NEXT: ldrb r6, [r7] -; CHECK-NEXT: vmov r5, r7, d5 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[8], r4 +; CHECK-NEXT: vmov.8 q4[9], r6 +; CHECK-NEXT: ldrb.w r9, [r2] +; CHECK-NEXT: vmov r1, r2, d12 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r10, [r2] +; CHECK-NEXT: ldrb.w r2, [r8] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r5] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov.8 q0[9], r7 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: vmov.8 q0[11], r6 -; CHECK-NEXT: vmov.8 q0[12], r5 -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vmov.8 q4[10], r2 +; CHECK-NEXT: vmov.8 q4[11], lr +; CHECK-NEXT: vmov.8 q4[12], r1 +; CHECK-NEXT: vmov.8 q4[13], r10 +; CHECK-NEXT: vmov.8 q4[14], r5 +; CHECK-NEXT: vmov.8 q4[15], r9 +; CHECK-NEXT: vstrb.8 q4, [r7], #16 +; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: bne .LBB16_2 ; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #64 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index 7b8b884576d13..82ec62ec9f7a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -602,60 +602,57 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #136] +; CHECK-NEXT: ldrne r0, [sp, #112] ; CHECK-NEXT: cmpne r0, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r12, [sp, #140] +; CHECK-NEXT: ldr.w r12, [sp, #116] ; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r2, r12, #3 -; CHECK-NEXT: subs r3, r2, #4 -; CHECK-NEXT: add.w r0, r7, r3, lsr #2 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: adr r3, .LCPI10_0 -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: lsl.w r0, r12, #1 -; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: lsls r6, r7, #1 -; CHECK-NEXT: vshl.i32 q3, q1, #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: bic r0, r12, #3 +; CHECK-NEXT: subs r3, r0, #4 +; CHECK-NEXT: add.w r3, r7, r3, lsr #2 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: lsl.w r7, r12, #1 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: lsls r6, r3, #1 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r3, r0, r5, lsl #1 +; CHECK-NEXT: add.w r3, r2, r8, lsl #1 ; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r11, r12 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add r3, r0 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: add r7, r12 +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add r5, r3 +; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: beq .LBB10_1 ; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -663,21 +660,22 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 -; CHECK-NEXT: mul r5, r3, r7 +; CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mul r8, r5, r3 ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB10_8 ; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r8, r5 -; CHECK-NEXT: add.w r8, r8, #1 -; CHECK-NEXT: cmp r8, r7 -; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1] +; CHECK-NEXT: add.w r3, r9, r8 +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1] +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: cmp r9, r3 ; CHECK-NEXT: beq .LBB10_4 ; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 @@ -692,46 +690,48 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmlas.i32 q5, q2, r8 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: vmlas.i32 q4, q5, r9 +; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vadd.i32 q5, q4, q2 +; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1] +; CHECK-NEXT: vldrh.s32 q4, [r3], #8 +; CHECK-NEXT: vmul.i32 q4, q6, q4 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: vaddv.u32 r10, q4 -; CHECK-NEXT: cmp r2, r12 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: vaddv.u32 r10, q3 +; CHECK-NEXT: cmp r0, r12 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: mla r3, r7, r4, r8 -; CHECK-NEXT: add.w r0, r11, r4 -; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: sub.w lr, r12, r4 -; CHECK-NEXT: add.w r9, r7, r0, lsl #1 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: add.w r3, r1, r3, lsl #1 +; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mla r3, r3, r4, r9 +; CHECK-NEXT: add.w r11, r1, r3, lsl #1 +; CHECK-NEXT: adds r3, r7, r4 +; CHECK-NEXT: add.w r3, r5, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r4, [r3] -; CHECK-NEXT: add r3, r6 -; CHECK-NEXT: ldrsh r0, [r9], #2 -; CHECK-NEXT: smlabb r10, r4, r0, r10 +; CHECK-NEXT: ldrsh.w r5, [r11] +; CHECK-NEXT: add r11, r6 +; CHECK-NEXT: ldrsh r4, [r3], #2 +; CHECK-NEXT: smlabb r10, r5, r4, r10 ; CHECK-NEXT: le lr, .LBB10_14 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 6633cec659d8e..9dc2d62f21d95 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -121,31 +121,29 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s4 -; CHECK-NEXT: s_addc_u32 s1, s1, s5 -; CHECK-NEXT: s_add_u32 s4, s0, -8 -; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_add_u32 s4, s0, s4 +; CHECK-NEXT: s_addc_u32 s5, s1, s5 +; CHECK-NEXT: s_add_u32 s0, s4, -8 +; CHECK-NEXT: s_addc_u32 s1, s5, -1 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 9 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: s_mov_b64 s[0:1], exec -; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: s_cbranch_execz .LBB3_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; CHECK-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: .LBB3_4: