-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[MachineLICM] Use RegisterClassInfo::getRegPressureSetLimit
#119826
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MachineLICM] Use RegisterClassInfo::getRegPressureSetLimit
#119826
Conversation
@llvm/pr-subscribers-backend-loongarch @llvm/pr-subscribers-llvm-transforms Author: Pengcheng Wang (wangpc-pp) Changes
It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/119826.diff. 50 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Pengcheng Wang (wangpc-pp) Changes
It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/119826.diff. 50 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
@llvm/pr-subscribers-backend-nvptx Author: Pengcheng Wang (wangpc-pp) Changes
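It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, as the comment "This limit must be adjusted dynamically for reserved registers" says. Separate from #118787. Patch is 5.10 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119826.diff 50 Files Affected: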
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d1d5509dc482a2..798c3461094a8d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -124,6 +124,7 @@ namespace {
const TargetRegisterInfo *TRI = nullptr;
const MachineFrameInfo *MFI = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;
bool PreRegAlloc = false;
bool HasProfileData = false;
@@ -392,6 +393,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
MFI = &MF.getFrameInfo();
MRI = &MF.getRegInfo();
SchedModel.init(&ST);
+ RegClassInfo.runOnMachineFunction(MF);
HasProfileData = MF.getFunction().hasProfileData();
@@ -408,7 +410,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
std::fill(RegPressure.begin(), RegPressure.end(), 0);
RegLimit.resize(NumRPS);
for (unsigned i = 0, e = NumRPS; i != e; ++i)
- RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
}
if (HoistConstLoads)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 23f24a9dc9982a..bd2bbb97983122 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -325,13 +325,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -370,13 +370,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -394,13 +394,13 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v5, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v3, v5, v3
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -469,21 +469,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v3, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -513,20 +513,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v3, v4
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -536,20 +536,20 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT: v_max_f32_e32 v5, v2, v2
+; GFX908-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -602,15 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[8:9], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -640,15 +640,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -686,15 +686,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
@@ -712,15 +712,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[8:9], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[4:5], v[8:9], v[4:5]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -758,21 +758,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[8:9], v[2:3], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[8:9]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -795,22 +795,22 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -840,21 +840,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v5, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -864,21 +864,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3]
+; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -918,13 +918,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX940-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX940-NEXT: v_max_f32_e32 v4, v4, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,13 +963,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT: v_max_f32_e32 v3, v2, v2
+; GFX90A-NEXT: v_max_f32_e32 v4, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v4, v4, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
@@ -987,13 +987,13 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v2, v2, ...
[truncated]
|
Ping. |
1 similar comment
Ping. |
`RegisterClassInfo::getRegPressureSetLimit` is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit` with some logic to adjust the limit by removing reserved registers. It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit` directly, as the comment "This limit must be adjusted dynamically for reserved registers" says. Separate from llvm#118787
922617d
to
0cee5a6
Compare
I've reverted this in eeac0ff, because it causes a very large compile-time regression, see https://llvm-compile-time-tracker.com/compare.php?from=e3e26dc41a6ad78c35a1a723cd77f5db8599797d&to=b4e17d4a314ed87ff6b40b4b05397d4b25b6636a&stat=instructions:u. If you need a test case for analysis, a good one is probably k.cc from kimwitu++, which regresses by more than 6%. |
Thanks for reporting. I'm not surprised by the compile-time increase. I was going to make |
…llvm#119826)" This reverts commit b4e17d4. This causes a large compile-time regression.
`RegisterClassInfo::getRegPressureSetLimit`
is a wrapper of `TargetRegisterInfo::getRegPressureSetLimit`
with some logic to adjust the limit by removing reserved registers.
It seems that we shouldn't use
`TargetRegisterInfo::getRegPressureSetLimit`
directly, as the comment "This limit must be adjusted
dynamically for reserved registers" says.
Separate from #118787